Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: changing health status thresholds #1905

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions pkg/daemon/server/service/health_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@ const (
)

// HealthThresholds are the thresholds used to compute the health of a vertex
const (
var (
// criticalBufferThreshold is the threshold above which the health of a vertex is critical
criticalBufferThreshold = 95
criticalBufferThreshold = v1alpha1.DefaultBufferUsageLimit * float64(90) // 95
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thresholds for different vertices should be different, based on each vertex's limits configuration. Is this right? @kohlisid

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, now that the threshold is relative, it can vary from vertex to vertex.
The code will need changes to compare each timeline against the limits configured for the corresponding vertex.

// warningBufferThreshold is the threshold above which the health of a vertex is warning
warningBufferThreshold = 80
warningBufferThreshold = float64(0.85) * criticalBufferThreshold // 80
)

// Dataflow states
Expand Down Expand Up @@ -231,6 +231,7 @@ func (hc *HealthChecker) getPipelineVertexDataCriticality(ctx context.Context) (
}
// update the usage timeline for all the ISBs used in the pipeline
hc.updateUsageTimeline(buffers.Buffers)
hc.udpateThresholds()

var vertexState []*vertexState

Expand All @@ -255,6 +256,15 @@ func (hc *HealthChecker) getPipelineVertexDataCriticality(ctx context.Context) (
return vertexState, nil
}

// udpateThresholds recomputes the package-level buffer thresholds from the
// pipeline's configured buffer usage limit:
//
//	criticalBufferThreshold = BufferUsageLimit * 0.9
//	warningBufferThreshold  = criticalBufferThreshold * 0.85
//
// Spec.Limits and Spec.Limits.BufferUsageLimit are pointers and may be unset
// (for example on a zero-value Pipeline). In that case the thresholds are left
// at their current values instead of dereferencing a nil pointer.
//
// NOTE(review): the thresholds are package-level variables, so this call
// affects every HealthChecker in the process; the misspelled method name is
// kept because existing callers use it.
func (hc *HealthChecker) udpateThresholds() {
	if hc == nil || hc.pipeline == nil {
		return
	}
	limits := hc.pipeline.Spec.Limits
	if limits == nil || limits.BufferUsageLimit == nil {
		// No explicit limit configured: keep the thresholds as they are.
		return
	}
	criticalBufferThreshold = float64(*limits.BufferUsageLimit) * 0.9
	warningBufferThreshold = criticalBufferThreshold * 0.85
}

// GetThresholds reports the buffer usage thresholds currently in effect.
// The first result is the critical threshold and the second is the warning
// threshold; both are read from the package-level threshold variables.
func (hc *HealthChecker) GetThresholds() (float64, float64) {
	critical := criticalBufferThreshold
	warning := warningBufferThreshold
	return critical, warning
}

// updateUsageTimeline is used to update the usage timeline for a given buffer list
// This iterates over all the buffers in the buffer list and updates the usage timeline for each buffer
// The timeline data is represented as a map of buffer name to a list of timelineEntry
Expand Down Expand Up @@ -348,9 +358,9 @@ func calculateEWMAUsage(bufferUsage []float64) []float64 {
func assignStateToBufferUsage(ewmaValue float64) string {
// Assign the state to the buffer usage
var state string
if ewmaValue > criticalBufferThreshold {
if ewmaValue > float64(criticalBufferThreshold) {
state = criticalState
} else if ewmaValue > warningBufferThreshold {
} else if ewmaValue > float64(warningBufferThreshold) {
state = warningState
} else {
state = healthyState
Expand Down
83 changes: 70 additions & 13 deletions pkg/daemon/server/service/health_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,25 @@ func TestNewHealthChecker(t *testing.T) {
}
}

// TestHealthThresholds checks the thresholds reported by GetThresholds, first
// for a freshly created HealthChecker and then after udpateThresholds has been
// run against pipelines with different BufferUsageLimit values.
func TestHealthThresholds(t *testing.T) {
	hc := NewHealthChecker(&v1alpha1.Pipeline{}, &mockISBService{})

	// Before any update the thresholds hold their initial values.
	critical, _ := hc.GetThresholds()
	assert.Equal(t, float64(72), critical)

	forty := uint32(40)
	eighty := uint32(80)

	// critical = limit * 0.9, warning = critical * 0.85.
	hc.pipeline.Spec.Limits = &v1alpha1.PipelineLimits{BufferUsageLimit: &forty}
	hc.udpateThresholds()
	critical, warning := hc.GetThresholds()
	assert.Equal(t, float64(36), critical)
	// The warning value is not exactly representable, so compare with a tolerance.
	assert.InDelta(t, 30.6, warning, 1e-9)

	// An 80% limit yields the same critical value as asserted for the fresh
	// checker above, so the package-level thresholds end the test where they
	// started.
	hc.pipeline.Spec.Limits = &v1alpha1.PipelineLimits{BufferUsageLimit: &eighty}
	hc.udpateThresholds()
	critical, warning = hc.GetThresholds()
	assert.Equal(t, float64(72), critical)
	assert.InDelta(t, 61.2, warning, 1e-9)
}

// mockISBService is a test stand-in for the ISB service passed to
// NewHealthChecker. It embeds isbsvc.ISBService so the struct satisfies that
// interface without spelling out every method.
// NOTE(review): presumably only the methods the tests need are overridden
// elsewhere in this file — confirm before calling others on the mock.
type mockISBService struct {
	isbsvc.ISBService
}
Expand Down Expand Up @@ -576,30 +595,32 @@ func TestAssignStateToBufferUsage(t *testing.T) {
}{
{
name: "Critical state",
ewmaValue: 96,
ewmaValue: v1alpha1.DefaultBufferUsageLimit * 95,
expected: criticalState,
},
{
name: "Warning state",
ewmaValue: 85,
ewmaValue: v1alpha1.DefaultBufferUsageLimit * 90 * 0.86,
expected: warningState,
},
{
name: "Healthy state",
ewmaValue: 30,
ewmaValue: v1alpha1.DefaultBufferUsageLimit * 30,
expected: healthyState,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := assignStateToBufferUsage(tt.ewmaValue)
t.Log(tt.ewmaValue, result)
assert.Equal(t, tt.expected, result)
})
}
}

func TestAssignStateToTimeline(t *testing.T) {
n := v1alpha1.DefaultBufferUsageLimit * 100
tests := []struct {
name string
ewmaValues []float64
Expand All @@ -608,61 +629,61 @@ func TestAssignStateToTimeline(t *testing.T) {
}{
{
name: "Single healthy value",
ewmaValues: []float64{30},
ewmaValues: []float64{n * 0.3},
lookBack: false,
expected: healthyState,
},
{
name: "Single warning value",
ewmaValues: []float64{85},
ewmaValues: []float64{n * 0.85},
lookBack: false,
expected: warningState,
},
{
name: "Single critical value without lookback",
ewmaValues: []float64{96},
ewmaValues: []float64{n * 0.95},
lookBack: false,
expected: criticalState,
},
{
name: "Single critical value with lookback",
ewmaValues: []float64{96},
ewmaValues: []float64{n * 0.95},
lookBack: true,
expected: warningState,
},
{
name: "Multiple values ending with critical, no lookback",
ewmaValues: []float64{30, 85, 96},
ewmaValues: []float64{n * 0.3, n * 0.7, n * 0.92},
lookBack: false,
expected: criticalState,
},
{
name: "Multiple values ending with critical, with lookback, insufficient critical count",
ewmaValues: []float64{30, 85, 96, 96},
ewmaValues: []float64{n * 0.3, n * 0.7, n * 0.95, n * 0.95},
lookBack: true,
expected: warningState,
},
{
name: "Multiple values ending with critical, with lookback, sufficient critical count",
ewmaValues: []float64{96, 96, 96, 96, 96},
ewmaValues: []float64{n * 0.95, n * 0.95, n * 0.95, n * 0.95, n * 0.95},
lookBack: true,
expected: criticalState,
},
{
name: "Values fluctuating between warning and critical",
ewmaValues: []float64{85, 96, 85, 96, 85},
ewmaValues: []float64{n * 0.85, n * 0.95, n * 0.85, n * 0.95, n * 0.85},
lookBack: true,
expected: warningState,
},
{
name: "Values increasing from healthy to critical",
ewmaValues: []float64{30, 50, 70, 90, 96},
ewmaValues: []float64{n * 0.3, n * 0.5, n * 0.7, n * 0.9, n * 0.96},
lookBack: true,
expected: warningState,
},
{
name: "Values decreasing from critical to healthy",
ewmaValues: []float64{96, 90, 70, 50, 30},
ewmaValues: []float64{n * 0.96, n * 0.9, n * 0.7, n * 0.5, n * 0.3},
lookBack: true,
expected: healthyState,
},
Expand All @@ -676,6 +697,42 @@ func TestAssignStateToTimeline(t *testing.T) {
}
}

// func Scalalbility_error_test(t *testing.T) {
// pipelineName := "simple-pipeline"
// namespace := "numaflow-system"
// edges := []v1alpha1.Edge{
// {
// From: "in",
// To: "cat",
// },
// {
// From: "cat",
// To: "out",
// },
// }
// pipeline := &v1alpha1.Pipeline{
// ObjectMeta: metav1.ObjectMeta{
// Name: pipelineName,
// Namespace: namespace,
// },
// Spec: v1alpha1.PipelineSpec{
// Vertices: []v1alpha1.AbstractVertex{
// {Name: "in", Source: &v1alpha1.Source{}},
// {Name: "cat", UDF: &v1alpha1.UDF{}},
// {Name: "out", Sink: &v1alpha1.Sink{}},
// },
// Edges: edges,
// },
// }

// for v := range pipeline.Spec.Vertices {
// pipeline.Spec.Vertices[v].Scale = v1alpha1.Scale{Disabled: true}
// }

// pipeline.

// }

func TestConvertVertexStateToPipelineState(t *testing.T) {
tests := []struct {
name string
Expand Down
Loading