Initialize metrics with labels #2162

Merged
merged 28 commits, Mar 1, 2024
Changes from 20 commits

28 commits
269a1cc
metrics: Add comments about metrics naming
lambdanis Feb 23, 2024
af88716
errormetrics: Define error types as integers not strings
lambdanis Feb 23, 2024
d2724f8
errormetrics: Use ops.OpCode type instead of basic ints
lambdanis Feb 24, 2024
187592b
api/ops: Define OpCodeStrings map with opcode string values
lambdanis Feb 26, 2024
fea557c
api/ops: Add missing OpCode values
lambdanis Feb 26, 2024
623c564
errormetrics, observer: Define error_type label values
lambdanis Feb 26, 2024
75ab0f9
errormetrics: Initialize metrics with labels
lambdanis Feb 23, 2024
e53dc6c
eventcachemetrics: Define entry_type label values
lambdanis Feb 23, 2024
93bcbfc
eventcachemetrics: Use tetragon.EventType as event_type label
lambdanis Feb 26, 2024
decdb63
eventcachemetrics: Add event_type label to errors metric
lambdanis Feb 26, 2024
fbacf4b
eventcachemetrics: Define error label values
lambdanis Feb 26, 2024
a7918f1
eventcachemetrics: Initialize metrics with labels
lambdanis Feb 23, 2024
2e6e219
reader/exec: Define FlagStrings map with flag string values
lambdanis Feb 26, 2024
546bf72
eventmetrics: Initialize metrics with labels
lambdanis Feb 28, 2024
6e55729
kprobemetrics: Define curr_type and prev_type label values
lambdanis Feb 26, 2024
d90fc84
opcodemetrics: Use ops.OpCode type instead of basic ints
lambdanis Feb 26, 2024
efe5cd7
opcodemetrics: Initialize metrics with labels
lambdanis Feb 26, 2024
07580f3
policyfiltermetrics: Define subsys label values
lambdanis Feb 27, 2024
3f0f378
policyfiltermetrics: Define op label values
lambdanis Feb 27, 2024
4262ed8
policyfiltermetrics: Remove error label
lambdanis Feb 27, 2024
37739a5
policyfiltermetrics: Initialize metrics with labels
lambdanis Feb 27, 2024
d2f5d01
Remove pkg/metrics/processexecmetrics
lambdanis Feb 27, 2024
20cfea2
watchermetrics: Define watcher label values
lambdanis Feb 27, 2024
4f29532
watchermetrics: Initialize metrics with labels
lambdanis Feb 27, 2024
5cdc9b4
observer: Define op as integers not strings
lambdanis Feb 28, 2024
415e68e
observer: Initialize metrics with labels
lambdanis Feb 27, 2024
f5f9629
tracing: Initialize metrics with labels
lambdanis Feb 27, 2024
b24e3f1
api/ops: Make OpCode.String() more informative for unknown opcodes
lambdanis Feb 29, 2024
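Taken together, the commits implement one recurring pattern across the metrics packages: the allowed values of each label are defined as typed constants with a value-to-string map, and InitMetrics pre-creates every known label combination with Add(0), so all series are exported at zero from startup instead of appearing only after the first matching event (see the errormetrics diff below). A minimal sketch of the pattern, using hypothetical metric and label names rather than Tetragon's actual ones:

package examplemetrics

import "github.com/prometheus/client_golang/prometheus"

// ErrorType enumerates the allowed values of the "type" label.
type ErrorType int

const (
	CacheMiss ErrorType = iota
	CacheEvicted
)

// errorTypeLabels maps each ErrorType to its string label value.
var errorTypeLabels = map[ErrorType]string{
	CacheMiss:    "cache_miss",
	CacheEvicted: "cache_evicted",
}

func (e ErrorType) String() string { return errorTypeLabels[e] }

var errorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "example_errors_total",
	Help: "Example errors counted by type.",
}, []string{"type"})

// InitMetrics registers the vector and pre-creates one child per known
// label value, so every series is visible at 0 before any error occurs.
func InitMetrics(registry *prometheus.Registry) {
	registry.MustRegister(errorsTotal)
	for t := range errorTypeLabels {
		errorsTotal.WithLabelValues(t.String()).Add(0)
	}
}

Pre-initialized series make the absence of errors observable and keep rate() and increase() queries well-defined from the first scrape, at the cost of committing to a bounded, known set of label values.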
2 changes: 1 addition & 1 deletion go.mod
@@ -29,7 +29,6 @@ require (
github.com/mennanov/fieldmask-utils v1.1.0
github.com/opencontainers/runtime-spec v1.2.0
github.com/pelletier/go-toml v1.9.5
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.19.0
github.com/prometheus/client_model v0.6.0
github.com/sirupsen/logrus v1.9.3
@@ -149,6 +148,7 @@ require (
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
github.com/prometheus/common v0.48.0 // indirect
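The go.mod change follows from the errormetrics rewrite further down: the handler-error label is now derived from a typed EventHandlerError instead of errors.Cause, so github.com/pkg/errors is no longer imported directly and drops to the indirect requirements.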
36 changes: 25 additions & 11 deletions pkg/api/ops/ops.go
@@ -65,21 +65,35 @@ const (
MsgOpKfreeSkb = 11
MsgOpGenericKprobe = 13
MsgOpGeneric_Tracepoint = 14
MsgOpGenericUprobe = 15
MsgOpClone = 23
MsgOpData = 24
MsgOpCgroup = 25
MsgOpLoader = 26
MsgOpTest = 254
)

var OpCodeStrings = map[OpCode]string{
MsgOpUndef: "Undef",
MsgOpExecve: "Execve",
MsgOpExit: "Exit",
MsgOpKfreeSkb: "KfreeSkb",
MsgOpGenericKprobe: "GenericKprobe",
MsgOpGeneric_Tracepoint: "GenericTracepoint",
MsgOpGenericUprobe: "GenericUprobe",
MsgOpClone: "Clone",
MsgOpData: "Data",
MsgOpCgroup: "Cgroup",
MsgOpLoader: "Loader",
MsgOpTest: "Test",
}

func (op OpCode) String() string {
return [...]string{
0: "Undef",
5: "Execve",
7: "Exit",
13: "GenericKprobe",
14: "GenericTracepoint",
23: "Clone",
24: "Data",
25: "Cgroup",
254: "Test",
}[op]
s, ok := OpCodeStrings[op]
if !ok {
return ""
}
return s
}

func (op CgroupOpCode) String() string {
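The final commit in the series (b24e3f1, "api/ops: Make OpCode.String() more informative for unknown opcodes") later replaces the empty-string fallback shown above, but that change is outside this 20-commit view. A plausible sketch of such a fallback (the exact format is an assumption, not taken from the PR):

func (op OpCode) String() string {
	s, ok := OpCodeStrings[op]
	if !ok {
		// Assumed fallback: include the numeric value so unknown opcodes
		// remain distinguishable in logs and metric labels.
		return fmt.Sprintf("unknown(%d)", int32(op))
	}
	return s
}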
6 changes: 3 additions & 3 deletions pkg/eventcache/eventcache.go
@@ -141,11 +141,11 @@ func (ec *Cache) handleEvents() {
continue
}
if errors.Is(err, ErrFailedToGetParentInfo) {
eventcachemetrics.ParentInfoError(notify.EventTypeString(event.event)).Inc()
eventcachemetrics.ParentInfoError(notify.EventType(event.event)).Inc()
} else if errors.Is(err, ErrFailedToGetProcessInfo) {
eventcachemetrics.ProcessInfoError(notify.EventTypeString(event.event)).Inc()
eventcachemetrics.ProcessInfoError(notify.EventType(event.event)).Inc()
} else if errors.Is(err, ErrFailedToGetPodInfo) {
eventcachemetrics.PodInfoError(notify.EventTypeString(event.event)).Inc()
eventcachemetrics.PodInfoError(notify.EventType(event.event)).Inc()
}
}

4 changes: 2 additions & 2 deletions pkg/grpc/exec/exec.go
@@ -70,7 +70,7 @@ func GetProcessExec(event *MsgExecveEventUnix, useCache bool) *tetragon.ProcessE
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessExec: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

@@ -394,7 +394,7 @@ func GetProcessExit(event *MsgExitEventUnix) *tetragon.ProcessExit {
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessExit: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

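The eventcachemetrics package itself is not part of this diff, but the call sites above imply that EventCacheError now takes a typed error reason plus the event's tetragon.EventType. A rough sketch of what such a helper could look like; the metric name, label names, and type names here are assumptions inferred from the call sites, not the actual file:

package eventcachemetrics

import (
	"github.com/cilium/tetragon/api/v1/tetragon"
	"github.com/prometheus/client_golang/prometheus"
)

// CacheError enumerates reasons for failing to build an event.
type CacheError int

const (
	NilProcessPid CacheError = iota
)

var cacheErrorLabels = map[CacheError]string{
	NilProcessPid: "nil_process_pid",
}

func (e CacheError) String() string { return cacheErrorLabels[e] }

var errorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "event_cache_errors_total",
	Help: "Event-building errors by error reason and event type.",
}, []string{"error", "event_type"})

// EventCacheError returns the counter child for the given error and event type.
func EventCacheError(err CacheError, eventType tetragon.EventType) prometheus.Counter {
	return errorsTotal.WithLabelValues(err.String(), eventType.String())
}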
3 changes: 3 additions & 0 deletions pkg/grpc/tracing/stats.go
@@ -19,6 +19,9 @@ var (

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(LoaderStats)

// NOTES:
// * Rename process_loader_stats metric (to e.g. process_loader_events_total) and count label (to e.g. event)?
Comment on lines +28 to +29
Contributor: Does it make sense to link GH issues here?
lambdanis (author): yeah, I think so, just I'm not 100% sure which of these points are worth implementing and which not. Many of these are technically breaking changes, but also many metrics are probably used only by Tetragon developers so 🤷‍♀️ I need one more pass over the metrics and maybe another pair of eyes, then I'll open issues for improvements.
}

type LoaderType int
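For reference, if the rename floated in that NOTE were adopted, the loader metric definition would presumably end up along these lines (a sketch of the suggestion only; nothing in this PR makes the change):

var LoaderStats = prometheus.NewCounterVec(prometheus.CounterOpts{
	Namespace: consts.MetricsNamespace,
	Name:      "process_loader_events_total", // currently: process_loader_stats
	Help:      "Process loader event statistics.",
}, []string{"event"}) // currently: count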
30 changes: 14 additions & 16 deletions pkg/grpc/tracing/tracing.go
@@ -335,7 +335,7 @@ func GetProcessKprobe(event *MsgGenericKprobeUnix) *tetragon.ProcessKprobe {
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessKprobe: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

@@ -497,7 +497,7 @@ func (msg *MsgGenericTracepointUnix) HandleMessage() *tetragon.GetEventsResponse
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessTracepoint: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

@@ -615,28 +615,26 @@ func GetProcessLoader(msg *MsgProcessLoaderUnix) *tetragon.ProcessLoader {
tetragonProcess = process.UnsafeGetProcess()
}

notifyEvent := &ProcessLoaderNotify{
ProcessLoader: tetragon.ProcessLoader{
Process: tetragonProcess,
Path: msg.Path,
Buildid: msg.Buildid,
},
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessLoader: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(notifyEvent)).Inc()
return nil
}

if ec := eventcache.Get(); ec != nil &&
(ec.Needed(tetragonProcess) || (tetragonProcess.Pid.Value > 1)) {
tetragonEvent := &ProcessLoaderNotify{}
tetragonEvent.Process = tetragonProcess
tetragonEvent.Path = msg.Path
tetragonEvent.Buildid = msg.Buildid
ec.Add(nil, tetragonEvent, msg.Msg.Common.Ktime, msg.Msg.ProcessKey.Ktime, msg)
ec.Add(nil, notifyEvent, msg.Msg.Common.Ktime, msg.Msg.ProcessKey.Ktime, msg)
return nil
}

tetragonEvent := &tetragon.ProcessLoader{
Process: tetragonProcess,
Path: msg.Path,
Buildid: msg.Buildid,
}

return tetragonEvent
return &notifyEvent.ProcessLoader
}

func (msg *MsgProcessLoaderUnix) Notify() bool {
@@ -735,7 +733,7 @@ func GetProcessUprobe(event *MsgGenericUprobeUnix) *tetragon.ProcessUprobe {
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessUprobe: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

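The GetProcessLoader refactor above builds the ProcessLoaderNotify wrapper once and reuses it on both paths: it is handed to the event cache when caching is needed, and otherwise its embedded message is returned directly via &notifyEvent.ProcessLoader. A minimal illustration of the embedding this relies on (buildLoaderEvent is a hypothetical helper; the real type is defined elsewhere in the package):

// ProcessLoaderNotify wraps the API message, exposing it as an embedded,
// addressable field.
type ProcessLoaderNotify struct {
	tetragon.ProcessLoader
}

func buildLoaderEvent(proc *tetragon.Process, path string) (*ProcessLoaderNotify, *tetragon.ProcessLoader) {
	ev := &ProcessLoaderNotify{
		ProcessLoader: tetragon.ProcessLoader{Process: proc, Path: path},
	}
	// One allocation serves both the cache path (ev) and the direct-return
	// path (&ev.ProcessLoader).
	return ev, &ev.ProcessLoader
}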
88 changes: 69 additions & 19 deletions pkg/metrics/errormetrics/errormetrics.go
@@ -5,32 +5,62 @@ package errormetrics

import (
"fmt"
"strings"

"github.com/cilium/tetragon/pkg/api/ops"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)

type ErrorType string
type ErrorType int

var (
const (
// Process not found on get() call.
ProcessCacheMissOnGet ErrorType = "process_cache_miss_on_get"
ProcessCacheMissOnGet ErrorType = iota
// Process evicted from the cache.
ProcessCacheEvicted ErrorType = "process_cache_evicted"
ProcessCacheEvicted
// Process not found on remove() call.
ProcessCacheMissOnRemove ErrorType = "process_cache_miss_on_remove"
ProcessCacheMissOnRemove
// Tid and Pid mismatch that could affect BPF and user space caching logic
ProcessPidTidMismatch ErrorType = "process_pid_tid_mismatch"
ProcessPidTidMismatch
// An event is missing process info.
EventMissingProcessInfo ErrorType = "event_missing_process_info"
EventMissingProcessInfo
// An error occurred in an event handler.
HandlerError ErrorType = "handler_error"
HandlerError
// An event finalizer on Process failed
EventFinalizeProcessInfoFailed ErrorType = "event_finalize_process_info_failed"
EventFinalizeProcessInfoFailed
)

var errorTypeLabelValues = map[ErrorType]string{
ProcessCacheMissOnGet: "process_cache_miss_on_get",
ProcessCacheEvicted: "process_cache_evicted",
ProcessCacheMissOnRemove: "process_cache_miss_on_remove",
ProcessPidTidMismatch: "process_pid_tid_mismatch",
EventMissingProcessInfo: "event_missing_process_info",
HandlerError: "handler_error",
EventFinalizeProcessInfoFailed: "event_finalize_process_info_failed",
}

func (e ErrorType) String() string {
return errorTypeLabelValues[e]
}
Comment on lines +43 to +45
Contributor: Should we have a fallback here for missing types to avoid panics?
lambdanis (author): It won't panic, in case e is not in errorTypeLabelValues it will return an empty string (zero value).

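The reply relies on Go's map zero-value semantics: indexing a map with a key that has no entry yields the value type's zero value rather than panicking. With errorTypeLabelValues above:

// A lookup with an unknown key does not panic; it returns the string
// zero value and ok == false, so String() yields "" for unknown types.
s, ok := errorTypeLabelValues[ErrorType(1000)] // s == "", ok == false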

type EventHandlerError int

// TODO: Recognize different errors returned by individual handlers
const (
HandlePerfUnknownOp EventHandlerError = iota
HandlePerfHandlerError
)

var eventHandlerErrorLabelValues = map[EventHandlerError]string{
HandlePerfUnknownOp: "unknown_opcode",
HandlePerfHandlerError: "event_handler_failed",
}

func (e EventHandlerError) String() string {
return eventHandlerErrorLabelValues[e]
}

var (
ErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
@@ -50,24 +80,44 @@
func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(ErrorTotal)
registry.MustRegister(HandlerErrors)

// Initialize metrics with labels
for er := range errorTypeLabelValues {
GetErrorTotal(er).Add(0)
}
for opcode := range ops.OpCodeStrings {
if opcode != ops.MsgOpUndef && opcode != ops.MsgOpTest {
GetHandlerErrors(opcode, HandlePerfHandlerError).Add(0)
}
}
// NB: We initialize only ops.MsgOpUndef here, but unknown_opcode can occur for any opcode
// that is not explicitly handled.
GetHandlerErrors(ops.MsgOpUndef, HandlePerfUnknownOp).Add(0)

// NOTES:
// * op, msg_op, opcode - standardize on a label (+ add human-readable label)
// * error, error_type, type - standardize on a label
// * Delete errors_total{type="handler_error"} - it duplicates handler_errors_total
// * Consider further splitting errors_total
// * Rename handler_errors_total to event_handler_errors_total?
}

// Get a new handle on an ErrorTotal metric for an ErrorType
func GetErrorTotal(t ErrorType) prometheus.Counter {
return ErrorTotal.WithLabelValues(string(t))
func GetErrorTotal(er ErrorType) prometheus.Counter {
return ErrorTotal.WithLabelValues(er.String())
}

// Increment an ErrorTotal for an ErrorType
func ErrorTotalInc(t ErrorType) {
GetErrorTotal(t).Inc()
func ErrorTotalInc(er ErrorType) {
GetErrorTotal(er).Inc()
}

// Get a new handle on the HandlerErrors metric
func GetHandlerErrors(opcode int, err error) prometheus.Counter {
return HandlerErrors.WithLabelValues(fmt.Sprint(opcode), strings.ReplaceAll(fmt.Sprintf("%T", errors.Cause(err)), "*", ""))
func GetHandlerErrors(opcode ops.OpCode, er EventHandlerError) prometheus.Counter {
return HandlerErrors.WithLabelValues(fmt.Sprint(int32(opcode)), er.String())
}

// Increment the HandlerErrors metric
func HandlerErrorsInc(opcode int, err error) {
GetHandlerErrors(opcode, err).Inc()
func HandlerErrorsInc(opcode ops.OpCode, er EventHandlerError) {
GetHandlerErrors(opcode, er).Inc()
}
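With the typed API, call sites pass an ops.OpCode and an EventHandlerError instead of a raw int and a Go error. Illustrative calls from an event handler (the surrounding observer code is assumed, not shown in this diff; op stands for the raw opcode read from the perf ring buffer):

// A handler failed while processing a generic kprobe message:
errormetrics.HandlerErrorsInc(ops.MsgOpGenericKprobe, errormetrics.HandlePerfHandlerError)

// The observer read an opcode it does not recognize:
errormetrics.HandlerErrorsInc(ops.OpCode(op), errormetrics.HandlePerfUnknownOp)

The opcode label value is the numeric opcode (fmt.Sprint(int32(opcode)) in GetHandlerErrors above); the NOTES in InitMetrics leave open whether to standardize the label names across metrics and add a human-readable opcode label.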