diff --git a/internal/fs/wrappers/monitoring.go b/internal/fs/wrappers/monitoring.go index db99866e80..13cdcd1a52 100644 --- a/internal/fs/wrappers/monitoring.go +++ b/internal/fs/wrappers/monitoring.go @@ -62,7 +62,7 @@ func init() { Measure: opsErrorCount, Description: "The cumulative number of errors generated by file system operations", Aggregation: view.Sum(), - TagKeys: []tag.Key{tags.FSOp, tags.FSErrCategory}, + TagKeys: []tag.Key{tags.FSOp, tags.FSError, tags.FSErrCategory}, }, &view.View{ Name: "fs/ops_latency", @@ -75,17 +75,24 @@ func init() { } } -// categorize maps an error to an error-category. -// This helps reduce the cardinality of the labels to less than 30. -// This lower number of errors allows the various errors to get piped to Cloud metrics without getting dropped. -func categorize(err error) string { +// errStrAndCategory maps err to the message of its syscall.Errno and to the category of that errno. +// A nil err yields two empty strings; an err with no syscall.Errno in its chain is treated as DefaultFSError. +// Only the category buckets errors to reduce cardinality so that the metric is not rejected by Cloud Monarch; the error string is returned unbucketed. +func errStrAndCategory(err error) (str string, category string) { if err == nil { - return "" + return "", "" } var errno syscall.Errno if !errors.As(err, &errno) { errno = DefaultFSError } + return errno.Error(), errCategory(errno) +} + +// errCategory maps a syscall.Errno to an error-category. +// This helps reduce the cardinality of the labels to less than 30. +// This lower number of errors allows the various errors to get piped to Cloud metrics without getting dropped. +func errCategory(errno syscall.Errno) string { switch errno { case syscall.ELNRNG, syscall.ENODEV, @@ -254,6 +261,7 @@ func categorize(err error) string { // Records file system operation count, failed operation count and the operation latency. func recordOp(ctx context.Context, method string, start time.Time, fsErr error) { + // Recording opCount. 
if err := stats.RecordWithTags( ctx, @@ -268,11 +276,12 @@ func recordOp(ctx context.Context, method string, start time.Time, fsErr error) // Recording opErrorCount. if fsErr != nil { - errCategory := categorize(fsErr) + errStr, errCategory := errStrAndCategory(fsErr) if err := stats.RecordWithTags( ctx, []tag.Mutator{ tag.Upsert(tags.FSOp, method), + tag.Upsert(tags.FSError, errStr), tag.Upsert(tags.FSErrCategory, errCategory), }, opsErrorCount.M(1), diff --git a/internal/fs/wrappers/monitoring_test.go b/internal/fs/wrappers/monitoring_test.go index ab67d2a407..0f5738c902 100644 --- a/internal/fs/wrappers/monitoring_test.go +++ b/internal/fs/wrappers/monitoring_test.go @@ -33,66 +33,82 @@ func TestFsErrStrAndCategory(t *testing.T) { t.Parallel() tests := []struct { fsErr error + expectedStr string expectedCategory string }{ { fsErr: fmt.Errorf("some random error"), + expectedStr: "input/output error", expectedCategory: "input/output error", }, { fsErr: syscall.ENOTEMPTY, + expectedStr: "directory not empty", expectedCategory: "directory not empty", }, { fsErr: syscall.EEXIST, + expectedStr: "file exists", expectedCategory: "file exists", }, { fsErr: syscall.EINVAL, + expectedStr: "invalid argument", expectedCategory: "invalid argument", }, { fsErr: syscall.EINTR, + expectedStr: "interrupted system call", expectedCategory: "interrupt errors", }, { fsErr: syscall.ENOSYS, + expectedStr: "function not implemented", expectedCategory: "function not implemented", }, { fsErr: syscall.ENOSPC, + expectedStr: "no space left on device", expectedCategory: "process/resource management errors", }, { fsErr: syscall.E2BIG, + expectedStr: "argument list too long", expectedCategory: "invalid operation", }, { fsErr: syscall.EHOSTDOWN, + expectedStr: "host is down", expectedCategory: "network errors", }, { fsErr: syscall.ENODATA, + expectedStr: "no data available", expectedCategory: "miscellaneous errors", }, { fsErr: syscall.ENODEV, + expectedStr: "no such device", expectedCategory: 
"device errors", }, { fsErr: syscall.EISDIR, + expectedStr: "is a directory", expectedCategory: "file/directory errors", }, { fsErr: syscall.ENOSYS, + expectedStr: "function not implemented", expectedCategory: "function not implemented", }, { fsErr: syscall.ENFILE, + expectedStr: "too many open files in system", expectedCategory: "too many open files", }, { fsErr: syscall.EPERM, + expectedStr: "operation not permitted", expectedCategory: "permission errors", }, } @@ -101,7 +117,10 @@ func TestFsErrStrAndCategory(t *testing.T) { t.Run(fmt.Sprintf("fsErrStrAndCategor_case_%d", idx), func(t *testing.T) { t.Parallel() - assert.Equal(t, tc.expectedCategory, categorize(tc.fsErr)) + actualErrStr, actualErrGrp := errStrAndCategory(tc.fsErr) + + assert.Equal(t, tc.expectedStr, actualErrStr) + assert.Equal(t, tc.expectedCategory, actualErrGrp) }) } } diff --git a/internal/monitor/tags/tags.go b/internal/monitor/tags/tags.go index 77592afa6b..915337beaa 100644 --- a/internal/monitor/tags/tags.go +++ b/internal/monitor/tags/tags.go @@ -29,6 +29,9 @@ var ( // FSOp annotates the file system op processed. FSOp = tag.MustNewKey("fs_op") + // FSError annotates failed file system operations with the error string of the failure. + FSError = tag.MustNewKey("fs_error") + // FSErrCategory reduces the cardinality of FSError by grouping errors together. FSErrCategory = tag.MustNewKey("fs_error_category")