From ad1bfee885c0ff0f0d1462918e8c794a74fb9279 Mon Sep 17 00:00:00 2001
From: Daniel Hodges
Date: Wed, 23 Oct 2024 07:32:54 -0700
Subject: [PATCH] scx_layered: Add cost accounting

Add cost accounting for layers to make weights work on the BPF side.
This is done both at the CPU level and globally. When a CPU runs out of
budget, it acquires more budget from the global context. If a layer runs
out of global budget, all budgets are reset. Weight handling is done by
iterating over layers in order of their available budget. Layer budgets
are proportional to layer weights.
---
 scheds/rust/scx_layered/src/bpf/cost.bpf.c  | 324 ++++++++++++++++++++
 scheds/rust/scx_layered/src/bpf/main.bpf.c  |  49 ++-
 scheds/rust/scx_layered/src/bpf/timer.bpf.h |   2 +-
 3 files changed, 363 insertions(+), 12 deletions(-)
 create mode 100644 scheds/rust/scx_layered/src/bpf/cost.bpf.c

diff --git a/scheds/rust/scx_layered/src/bpf/cost.bpf.c b/scheds/rust/scx_layered/src/bpf/cost.bpf.c
new file mode 100644
index 00000000..e9491646
--- /dev/null
+++ b/scheds/rust/scx_layered/src/bpf/cost.bpf.c
@@ -0,0 +1,324 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+#include
+#include
+#include
+
+
+/*
+ * Cost accounting struct that is used in both the per CPU and global context.
+ * Budgets are allowed to recurse to parent structs.
+ */
+struct cost {
+	s64	budget[MAX_LAYERS];
+	s64	capacity[MAX_LAYERS];
+	u32	pref_layer;
+	u32	idx;
+	bool	overflow;
+	bool	has_parent;
+};
+
+
+/*
+ * Map used for global cost accounting. Can be extended to support NUMA nodes.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, struct cost);
+	__uint(max_entries, MAX_NUMA_NODES + 1);
+	__uint(map_flags, 0);
+} cost_data SEC(".maps");
+
+/*
+ * CPU map for cost accounting. When budget is expired it requests budget from
+ * global entries.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, struct cost);
+	__uint(max_entries, 1);
+} cpu_cost_data SEC(".maps");
+
+static __always_inline struct cost *lookup_cost(u32 cost_id)
+{
+	struct cost *costc;
+
+	costc = bpf_map_lookup_elem(&cost_data, &cost_id);
+	if (!costc) {
+		scx_bpf_error("cost not found");
+		return NULL;
+	}
+
+	return costc;
+}
+
+static __always_inline struct cost *lookup_cpu_cost(s32 cpu)
+{
+	struct cost *costc;
+	u32 zero = 0;
+
+	if (cpu < 0)
+		costc = bpf_map_lookup_elem(&cpu_cost_data, &zero);
+	else
+		costc = bpf_map_lookup_percpu_elem(&cpu_cost_data,
+						   &zero, cpu);
+	if (!costc) {
+		scx_bpf_error("cost not found");
+		return NULL;
+	}
+
+	return costc;
+}
+
+/*
+ * Initializes a cost.
+ */
+static struct cost *initialize_cost(u32 cost_idx, u32 parent_idx,
+				    bool is_cpu, bool has_parent,
+				    bool overflow)
+{
+	struct cost *costc;
+
+	if (is_cpu) {
+		if (!(costc = lookup_cpu_cost(cost_idx)))
+			return NULL;
+	} else {
+		if (!(costc = lookup_cost(cost_idx)))
+			return NULL;
+	}
+
+	if (has_parent)
+		costc->idx = parent_idx;
+	else
+		costc->idx = cost_idx;
+
+	costc->has_parent = has_parent;
+	costc->overflow = overflow;
+	costc->pref_layer = bpf_get_smp_processor_id() % nr_layers;
+
+	return costc;
+}
+
+/*
+ * Initializes the cost of a layer.
+ */
+static void initialize_cost_layer(struct cost *costc, u32 layer_id, s64 capacity)
+{
+	costc->capacity[layer_id] = capacity;
+	costc->budget[layer_id] = capacity;
+}
+
+/*
+ * Returns the preferred layer based on the layer with the maximum budget.
+ */
+static u32 preferred_cost(struct cost *costc)
+{
+	u32 layer_id, id, max_layer = 0;
+	s64 max_budget = 0;
+	u32 rotation = bpf_get_smp_processor_id() % nr_layers;
+
+	bpf_for(id, 0, nr_layers) {
+		// If there are two equally weighted layers that have the same
+		// budget we rely on rotating the layers based on the cpu. This
+		// may not work well on low core machines.
+		layer_id = rotate_layer_id(id, rotation);
+		if (layer_id > nr_layers) {
+			scx_bpf_error("invalid layer");
+			return 0;
+		}
+		if (costc->budget[layer_id] > max_budget) {
+			max_budget = costc->budget[layer_id];
+			max_layer = layer_id;
+		}
+	}
+
+	return max_layer;
+}
+
+/*
+ * Refreshes the budget of a cost.
+ */
+int refresh_budget(int cost_id)
+{
+	struct cost *costc;
+
+	if (!(costc = lookup_cost(cost_id))) {
+		scx_bpf_error("failed to lookup cost %d", cost_id);
+		return 0;
+	}
+
+	u32 layer_id, id;
+	u32 rotation = bpf_get_smp_processor_id() % nr_layers;
+	bpf_for(id, 0, nr_layers) {
+		layer_id = rotate_layer_id(id, rotation);
+		if (layer_id > nr_layers) {
+			scx_bpf_error("invalid layer");
+			return 0;
+		}
+		s64 capacity = costc->capacity[layer_id];
+		__sync_lock_test_and_set(MEMBER_VPTR(*costc, .budget[layer_id]),
+					 capacity);
+	}
+
+	return 0;
+}
+
+/*
+ * Refreshes all budgets for all costs.
+ */
+int refresh_budgets(void)
+{
+	refresh_budget(0);
+
+	return 0;
+}
+
+/*
+ * Acquires a budget from a parent cost account.
+ */
+s64 acquire_budget(struct cost *costc, u32 layer_id, s64 amount)
+{
+	s64 budget = 0;
+
+	if (layer_id >= MAX_LAYERS || layer_id < 0) {
+		scx_bpf_error("invalid parent cost");
+		return budget;
+	}
+
+	if (!costc || !costc->has_parent)
+		return budget;
+
+
+	struct cost *parent_cost;
+	if (!(parent_cost = lookup_cost(costc->idx)))
+		return budget;
+
+	__sync_fetch_and_sub(&parent_cost->budget[layer_id], amount);
+
+	if (parent_cost->budget[layer_id] < 0)
+		refresh_budgets();
+
+	return amount;
+}
+
+/*
+ * Records the cost to the CPU budget. If the CPU is out of cost the CPU will
+ * acquire budget by either retrieving budget from the global context or
+ * refreshing all budgets.
+ */
+static int record_cpu_cost(struct cost *costc, u32 layer_id, s64 amount)
+{
+	if (layer_id >= MAX_LAYERS || !costc) {
+		scx_bpf_error("invalid layer %d", layer_id);
+		return 0;
+	}
+
+	__sync_fetch_and_sub(&costc->budget[layer_id], amount);
+
+	if (costc->budget[layer_id] <= 0) {
+		if (costc->has_parent) {
+			s64 budget = acquire_budget(costc, layer_id,
+						    costc->capacity[layer_id] + amount);
+			if (budget > 0) {
+				__sync_fetch_and_add(MEMBER_VPTR(*costc, .budget[layer_id]),
+						     costc->capacity[layer_id]);
+			}
+		}
+	}
+
+	u32 pref_layer = preferred_cost(costc);
+	if (pref_layer > nr_layers) {
+		scx_bpf_error("invalid pref_layer");
+		return 0;
+	}
+
+	costc->pref_layer = pref_layer;
+
+	return 0;
+}
+
+/*
+ * Returns the slice_ns of a layer if there is appropriate budget.
+ */
+int has_budget(struct cost *costc, struct layer *layer)
+{
+	if (!layer || !costc) {
+		scx_bpf_error("can't happen");
+		return 0;
+	}
+
+	u32 layer_id = layer->idx;
+	if (layer_id > nr_layers) {
+		scx_bpf_error("invalid layer %d", layer_id);
+		return 0;
+	}
+
+	s64 budget = *MEMBER_VPTR(*costc, .budget[layer_id]);
+	u64 layer_slice_ns = layer->slice_ns > 0 ? layer->slice_ns : slice_ns;
+
+	if (budget > layer_slice_ns)
+		return slice_ns;
+
+	return 0;
+}
+
+/*
+ * Initializes all budgets.
+ */
+static void initialize_budgets(u64 refresh_intvl_ns)
+{
+	struct layer *layer;
+	struct cost *costc;
+	int layer_id;
+	u64 layer_weight_dur, layer_weight_sum = 0;
+	s32 cpu;
+	u32 global = 0;
+
+	bpf_for(layer_id, 0, nr_layers) {
+		layer = &layers[layer_id];
+		if (!layer) {
+			scx_bpf_error("failed to lookup layer %d", layer_id);
+			return;
+		}
+		layer_weight_sum += layer->weight;
+	}
+
+	costc = initialize_cost(global, global, false, false, false);
+	if (!costc) {
+		scx_bpf_error("failed to initialize global budget");
+		return;
+	}
+
+	bpf_for(layer_id, 0, nr_layers) {
+		layer = &layers[layer_id];
+		if (!layer) {
+			scx_bpf_error("failed to lookup layer %d", layer_id);
+			return;
+		}
+		u64 layer_slice_ns = layer->slice_ns > 0 ? layer->slice_ns : slice_ns;
+
+		layer_weight_dur = (layer->weight * ((u64)refresh_intvl_ns * nr_possible_cpus)) /
+				   layer_weight_sum;
+		initialize_cost_layer(costc, layer_id, (s64)layer_weight_dur);
+		trace("BUDGET init global layer %d budget %lld",
+		      layer_id, costc->budget[layer_id]);
+
+		// TODO: add L3 budgets for topology awareness
+
+		bpf_for(cpu, 0, nr_possible_cpus) {
+			costc = initialize_cost(cpu, global, true,
+						true, false);
+			if (!costc) {
+				scx_bpf_error("failed to initialize cpu budget: %d", cpu);
+				return;
+			}
+			layer_weight_dur = (layer->weight * layer_slice_ns * refresh_intvl_ns) /
					   layer_weight_sum;
+			initialize_cost_layer(costc, layer_id, (s64)layer_weight_dur);
+			if (cpu == 0)
+				trace("BUDGET init cpu %d layer %d budget %lld",
+				      cpu, layer_id, costc->budget[layer_id]);
+		}
+	}
+}
diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 187b7d72..51fb1551 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -51,7 +51,6 @@ static u32 preempt_cursor;
 
 #include "util.bpf.c"
 
-
 UEI_DEFINE(uei);
 
 static inline bool vtime_before(u64 a, u64 b)
@@ -357,6 +356,10 @@ static bool refresh_cpumasks(int idx)
 	return total > 0;
 }
 
+// TODO: Refactor includes that have circular dependencies. This import must be
+// defined after some helpers, but before its helpers are used.
+#include "cost.bpf.c"
+
 SEC("fentry")
 int BPF_PROG(sched_tick_fentry)
 {
@@ -775,6 +778,7 @@ bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 		     struct task_ctx *tctx, struct layer *layer,
 		     bool preempt_first)
 {
+	struct cost *cost;
 	struct cpu_ctx *cand_cctx, *sib_cctx = NULL;
 	s32 sib;
 
@@ -784,6 +788,9 @@ bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 	if (!(cand_cctx = lookup_cpu_ctx(cand)) || cand_cctx->current_preempt)
 		return false;
 
+	if (!(cost = lookup_cpu_cost(cand)) || has_budget(cost, layer) == 0)
+		return false;
+
 	/*
 	 * If exclusive, we want to make sure the sibling CPU, if there's
 	 * one, is idle. However, if the sibling CPU is already running a
@@ -1206,11 +1213,12 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 {
 	struct cpu_ctx *cctx, *sib_cctx;
 	struct layer *layer;
+	struct cost *cost;
 	u64 dsq_id;
 	u32 idx, layer_idx;
 	s32 sib = sibling_cpu(cpu);
 
-	if (!(cctx = lookup_cpu_ctx(-1)))
+	if (!(cctx = lookup_cpu_ctx(-1)) || !(cost = lookup_cpu_cost(cpu)))
 		return;
 
 	/*
@@ -1241,12 +1249,14 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 
 	/* consume preempting layers first */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		if (layer->preempt && scx_bpf_consume(layer_idx))
 			return;
 	}
@@ -1257,12 +1267,14 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 
 	/* consume !open layers second */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		struct cpumask *layer_cpumask;
 
 		/* consume matching layers */
@@ -1278,12 +1290,14 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 
 	/* consume !preempting open layers */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		if (!layer->preempt && layers->open &&
 		    scx_bpf_consume(layer_idx))
 			return;
@@ -1299,11 +1313,13 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct cpu_ctx *cctx, *sib_cctx;
 	struct layer *layer;
+	struct cost *cost;
 	u64 dsq_id;
 	u32 idx, llc_idx, layer_idx;
 	s32 sib = sibling_cpu(cpu);
 
-	if (!(cctx = lookup_cpu_ctx(-1)))
+	if (!(cctx = lookup_cpu_ctx(-1)) ||
+	    !(cost = lookup_cpu_cost(cpu)))
 		return;
 
 	/*
@@ -1336,12 +1352,14 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume preempting layers first */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		bpf_for(llc_idx, 0, nr_llcs) {
 			u32 llc_id = rotate_llc_id(my_llc_id, llc_idx);
 			dsq_id = layer_dsq_id(layer_idx, llc_id);
@@ -1356,12 +1374,14 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume !open layers second */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		bpf_for(llc_idx, 0, nr_llcs) {
 			u32 llc_id = rotate_llc_id(my_llc_id, llc_idx);
 			struct cpumask *layer_cpumask;
@@ -1382,12 +1402,14 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume !preempting open layers */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		bpf_for(llc_idx, 0, nr_llcs) {
 			u32 llc_id = rotate_llc_id(my_llc_id, llc_idx);
 			dsq_id = layer_dsq_id(layer_idx, llc_id);
@@ -1800,6 +1822,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
 	struct layer *layer;
+	struct cost *cost;
 	s32 lidx;
 	u64 used;
 
@@ -1807,7 +1830,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 		return;
 
 	lidx = tctx->layer;
-	if (!(layer = lookup_layer(lidx)))
+	if (!(layer = lookup_layer(lidx)) || !(cost = lookup_cpu_cost(-1)))
 		return;
 
 	used = bpf_ktime_get_ns() - tctx->running_at;
@@ -1817,6 +1840,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 		used = layer->min_exec_ns;
 	}
 
+	record_cpu_cost(cost, layer->idx, (s64)used);
 	cctx->layer_cycles[lidx] += used;
 	cctx->current_preempt = false;
 	cctx->prev_exclusive = cctx->current_exclusive;
@@ -2286,7 +2310,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 			}
 		}
 	}
-	start_layered_timers();
+	initialize_budgets(1000LLU * NSEC_PER_MSEC);
+	ret = start_layered_timers();
+	if (ret < 0)
+		return ret;
 
 	return 0;
 }
diff --git a/scheds/rust/scx_layered/src/bpf/timer.bpf.h b/scheds/rust/scx_layered/src/bpf/timer.bpf.h
index 1395ea9a..726f26b3 100644
--- a/scheds/rust/scx_layered/src/bpf/timer.bpf.h
+++ b/scheds/rust/scx_layered/src/bpf/timer.bpf.h
@@ -20,7 +20,7 @@ struct layered_timer {
 	// if set to 0 the timer will only be scheduled once
 	u64 interval_ns;
 	u64 init_flags;
-	u64 start_flags;
+	int start_flags;
 };
 
 enum layer_timer_callbacks {
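
Note (reviewer addition, illustrative only): the global budget that
initialize_budgets() assigns to each layer is that layer's weight share of
the total CPU time available in one refresh interval
(refresh_intvl_ns * nr_possible_cpus). The sketch below is a plain
userspace C program, not part of the patch, that reproduces only that
arithmetic; the layer weights, CPU count, and interval are made-up example
values.

#include <stdint.h>
#include <stdio.h>

#define NR_LAYERS 3

int main(void)
{
	/* 1s refresh interval, matching what layered_init() passes in the patch */
	uint64_t refresh_intvl_ns = 1000ULL * 1000 * 1000;
	uint64_t nr_possible_cpus = 8;				/* example machine */
	uint64_t weights[NR_LAYERS] = { 100, 200, 700 };	/* example layer weights */
	uint64_t weight_sum = 0;

	for (int i = 0; i < NR_LAYERS; i++)
		weight_sum += weights[i];

	for (int i = 0; i < NR_LAYERS; i++) {
		/* same shape as the global layer_weight_dur in initialize_budgets() */
		uint64_t budget = (weights[i] * (refresh_intvl_ns * nr_possible_cpus)) /
				  weight_sum;
		printf("layer %d: budget %llu ns\n", i, (unsigned long long)budget);
	}
	return 0;
}

With these example values the three layers get 0.8s, 1.6s, and 5.6s of CPU
time per 1s interval across the 8 CPUs, i.e. budgets proportional to the
100:200:700 weights.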
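
Note (reviewer addition, illustrative only): at runtime, layered_stopping()
charges the used slice to the per-CPU budget via record_cpu_cost(); when the
per-CPU budget is exhausted it pulls from the global (parent) budget via
acquire_budget(), and when a layer's global budget goes negative everything
is reset by refresh_budgets(). The toy program below models that interaction
for a single CPU and a single layer; it is a simplified, non-atomic sketch
with invented numbers, not the patch's code.

#include <stdint.h>
#include <stdio.h>

struct toy_cost {
	int64_t budget;
	int64_t capacity;
};

static struct toy_cost global_cost = { .budget = 8000, .capacity = 8000 };
static struct toy_cost cpu_cost    = { .budget = 1000, .capacity = 1000 };

/* roughly refresh_budgets(): reset the budget back to capacity */
static void refresh_all(void)
{
	global_cost.budget = global_cost.capacity;
}

/* roughly record_cpu_cost() + acquire_budget() for one layer */
static void charge(int64_t used_ns)
{
	cpu_cost.budget -= used_ns;
	if (cpu_cost.budget <= 0) {
		/* pull one capacity plus the overrun from the parent budget */
		global_cost.budget -= cpu_cost.capacity + used_ns;
		if (global_cost.budget < 0)
			refresh_all();
		/* refill the CPU-local budget by one capacity */
		cpu_cost.budget += cpu_cost.capacity;
	}
}

int main(void)
{
	for (int i = 0; i < 12; i++) {
		charge(400);	/* pretend a task ran for 400ns */
		printf("tick %2d: cpu budget %5lld, global budget %5lld\n",
		       i, (long long)cpu_cost.budget,
		       (long long)global_cost.budget);
	}
	return 0;
}

The add-back of exactly one capacity per acquisition mirrors
record_cpu_cost(), which __sync_fetch_and_add()s costc->capacity[layer_id]
back into the per-CPU budget after a successful acquire.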