From ad1bfee885c0ff0f0d1462918e8c794a74fb9279 Mon Sep 17 00:00:00 2001
From: Daniel Hodges
Date: Wed, 23 Oct 2024 07:32:54 -0700
Subject: [PATCH] scx_layered: Add cost accounting

Add cost accounting for layers to make weights work on the BPF side.
This is done both at the CPU level and globally. When a CPU runs out of
budget, it acquires more budget from the global context. If a layer runs
out of global budget, all budgets are reset. Weight handling is done by
iterating over layers in order of their available budget. Layer budgets
are proportional to layer weights.
---
 scheds/rust/scx_layered/src/bpf/cost.bpf.c  | 324 ++++++++++++++++++++
 scheds/rust/scx_layered/src/bpf/main.bpf.c  |  49 ++-
 scheds/rust/scx_layered/src/bpf/timer.bpf.h |   2 +-
 3 files changed, 363 insertions(+), 12 deletions(-)
 create mode 100644 scheds/rust/scx_layered/src/bpf/cost.bpf.c

diff --git a/scheds/rust/scx_layered/src/bpf/cost.bpf.c b/scheds/rust/scx_layered/src/bpf/cost.bpf.c
new file mode 100644
index 00000000..e9491646
--- /dev/null
+++ b/scheds/rust/scx_layered/src/bpf/cost.bpf.c
@@ -0,0 +1,324 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+#include
+#include
+#include
+
+
+/*
+ * Cost accounting struct that is used in both the per CPU and global context.
+ * Budgets are allowed to recurse to parent structs.
+ */
+struct cost {
+	s64	budget[MAX_LAYERS];
+	s64	capacity[MAX_LAYERS];
+	u32	pref_layer;
+	u32	idx;
+	bool	overflow;
+	bool	has_parent;
+};
+
+
+/*
+ * Map used for global cost accounting. Can be extended to support NUMA nodes.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, struct cost);
+	__uint(max_entries, MAX_NUMA_NODES + 1);
+	__uint(map_flags, 0);
+} cost_data SEC(".maps");
+
+/*
+ * CPU map for cost accounting. When budget is expired it requests budget from
+ * global entries.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, struct cost);
+	__uint(max_entries, 1);
+} cpu_cost_data SEC(".maps");
+
+static __always_inline struct cost *lookup_cost(u32 cost_id)
+{
+	struct cost *costc;
+
+	costc = bpf_map_lookup_elem(&cost_data, &cost_id);
+	if (!costc) {
+		scx_bpf_error("cost not found");
+		return NULL;
+	}
+
+	return costc;
+}
+
+static __always_inline struct cost *lookup_cpu_cost(s32 cpu)
+{
+	struct cost *costc;
+	u32 zero = 0;
+
+	if (cpu < 0)
+		costc = bpf_map_lookup_elem(&cpu_cost_data, &zero);
+	else
+		costc = bpf_map_lookup_percpu_elem(&cpu_cost_data,
+						   &zero, cpu);
+	if (!costc) {
+		scx_bpf_error("cost not found");
+		return NULL;
+	}
+
+	return costc;
+}
+
+/*
+ * Initializes a cost.
+ */
+static struct cost *initialize_cost(u32 cost_idx, u32 parent_idx,
+				    bool is_cpu, bool has_parent,
+				    bool overflow)
+{
+	struct cost *costc;
+
+	if (is_cpu) {
+		if (!(costc = lookup_cpu_cost(cost_idx)))
+			return NULL;
+	} else {
+		if (!(costc = lookup_cost(cost_idx)))
+			return NULL;
+	}
+
+	if (has_parent)
+		costc->idx = parent_idx;
+	else
+		costc->idx = cost_idx;
+
+	costc->has_parent = has_parent;
+	costc->overflow = overflow;
+	costc->pref_layer = bpf_get_smp_processor_id() % nr_layers;
+
+	return costc;
+}
+
+/*
+ * Initializes the cost of a layer.
+ */
+static void initialize_cost_layer(struct cost *costc, u32 layer_id, s64 capacity)
+{
+	costc->capacity[layer_id] = capacity;
+	costc->budget[layer_id] = capacity;
+}
+
+/*
+ * Returns the preferred layer based on the layer with the maximum budget.
+ */
+static u32 preferred_cost(struct cost *costc)
+{
+	u32 layer_id, id, max_layer = 0;
+	s64 max_budget = 0;
+	u32 rotation = bpf_get_smp_processor_id() % nr_layers;
+
+	bpf_for(id, 0, nr_layers) {
+		// If there are two equally weighted layers that have the same
+		// budget we rely on rotating the layers based on the cpu. This
+		// may not work well on low core machines.
+		layer_id = rotate_layer_id(id, rotation);
+		if (layer_id > nr_layers) {
+			scx_bpf_error("invalid layer");
+			return 0;
+		}
+		if (costc->budget[layer_id] > max_budget) {
+			max_budget = costc->budget[layer_id];
+			max_layer = layer_id;
+		}
+	}
+
+	return max_layer;
+}
+
+/*
+ * Refreshes the budget of a cost.
+ */
+int refresh_budget(int cost_id)
+{
+	struct cost *costc;
+
+	if (!(costc = lookup_cost(cost_id))) {
+		scx_bpf_error("failed to lookup cost %d", cost_id);
+		return 0;
+	}
+
+	u32 layer_id, id;
+	u32 rotation = bpf_get_smp_processor_id() % nr_layers;
+	bpf_for(id, 0, nr_layers) {
+		layer_id = rotate_layer_id(id, rotation);
+		if (layer_id > nr_layers) {
+			scx_bpf_error("invalid layer");
+			return 0;
+		}
+		s64 capacity = costc->capacity[layer_id];
+		__sync_lock_test_and_set(MEMBER_VPTR(*costc, .budget[layer_id]),
+					 capacity);
+	}
+
+	return 0;
+}
+
+/*
+ * Refreshes all budgets for all costs.
+ */
+int refresh_budgets(void)
+{
+	refresh_budget(0);
+
+	return 0;
+}
+
+/*
+ * Acquires a budget from a parent cost account.
+ */
+s64 acquire_budget(struct cost *costc, u32 layer_id, s64 amount)
+{
+	s64 budget = 0;
+
+	if (layer_id >= MAX_LAYERS || layer_id < 0) {
+		scx_bpf_error("invalid parent cost");
+		return budget;
+	}
+
+	if (!costc || !costc->has_parent)
+		return budget;
+
+
+	struct cost *parent_cost;
+	if (!(parent_cost = lookup_cost(costc->idx)))
+		return budget;
+
+	__sync_fetch_and_sub(&parent_cost->budget[layer_id], amount);
+
+	if (parent_cost->budget[layer_id] < 0)
+		refresh_budgets();
+
+	return amount;
+}
+
+/*
+ * Records the cost to the CPU budget. If the CPU is out of cost the CPU will
+ * acquire budget by either retrieving budget from the global context or
+ * refreshing all budgets.
+ */
+static int record_cpu_cost(struct cost *costc, u32 layer_id, s64 amount)
+{
+	if (layer_id >= MAX_LAYERS || !costc) {
+		scx_bpf_error("invalid layer %d", layer_id);
+		return 0;
+	}
+
+	__sync_fetch_and_sub(&costc->budget[layer_id], amount);
+
+	if (costc->budget[layer_id] <= 0) {
+		if (costc->has_parent) {
+			s64 budget = acquire_budget(costc, layer_id,
+						    costc->capacity[layer_id] + amount);
+			if (budget > 0) {
+				__sync_fetch_and_add(MEMBER_VPTR(*costc, .budget[layer_id]),
+						     costc->capacity[layer_id]);
+			}
+		}
+	}
+
+	u32 pref_layer = preferred_cost(costc);
+	if (pref_layer > nr_layers) {
+		scx_bpf_error("invalid pref_layer");
+		return 0;
+	}
+
+	costc->pref_layer = pref_layer;
+
+	return 0;
+}
+
+/*
+ * Returns the slice_ns of a layer if there is appropriate budget.
+ */
+int has_budget(struct cost *costc, struct layer *layer)
+{
+	if (!layer || !costc) {
+		scx_bpf_error("can't happen");
+		return 0;
+	}
+
+	u32 layer_id = layer->idx;
+	if (layer_id > nr_layers) {
+		scx_bpf_error("invalid layer %d", layer_id);
+		return 0;
+	}
+
+	s64 budget = *MEMBER_VPTR(*costc, .budget[layer_id]);
+	u64 layer_slice_ns = layer->slice_ns > 0 ? layer->slice_ns : slice_ns;
+
+	if (budget > layer_slice_ns)
+		return slice_ns;
+
+	return 0;
+}
+
+/*
+ * Initializes all budgets.
+ */
+static void initialize_budgets(u64 refresh_intvl_ns)
+{
+	struct layer *layer;
+	struct cost *costc;
+	int layer_id;
+	u64 layer_weight_dur, layer_weight_sum = 0;
+	s32 cpu;
+	u32 global = 0;
+
+	bpf_for(layer_id, 0, nr_layers) {
+		layer = &layers[layer_id];
+		if (!layer) {
+			scx_bpf_error("failed to lookup layer %d", layer_id);
+			return;
+		}
+		layer_weight_sum += layer->weight;
+	}
+
+	costc = initialize_cost(global, global, false, false, false);
+	if (!costc) {
+		scx_bpf_error("failed to initialize global budget");
+		return;
+	}
+
+	bpf_for(layer_id, 0, nr_layers) {
+		layer = &layers[layer_id];
+		if (!layer) {
+			scx_bpf_error("failed to lookup layer %d", layer_id);
+			return;
+		}
+		u64 layer_slice_ns = layer->slice_ns > 0 ? layer->slice_ns : slice_ns;
+
+		layer_weight_dur = (layer->weight * ((u64)refresh_intvl_ns * nr_possible_cpus)) /
+				   layer_weight_sum;
+		initialize_cost_layer(costc, layer_id, (s64)layer_weight_dur);
+		trace("BUDGET init global layer %d budget %lld",
+		      layer_id, costc->budget[layer_id]);
+
+		// TODO: add L3 budgets for topology awareness
+
+		bpf_for(cpu, 0, nr_possible_cpus) {
+			costc = initialize_cost(cpu, global, true,
+						true, false);
+			if (!costc) {
+				scx_bpf_error("failed to initialize cpu budget: %d", cpu);
+				return;
+			}
+			layer_weight_dur = (layer->weight * layer_slice_ns * refresh_intvl_ns) /
					   layer_weight_sum;
+			initialize_cost_layer(costc, layer_id, (s64)layer_weight_dur);
+			if (cpu == 0)
+				trace("BUDGET init cpu %d layer %d budget %lld",
+				      cpu, layer_id, costc->budget[layer_id]);
+		}
+	}
+}
diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 187b7d72..51fb1551 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -51,7 +51,6 @@ static u32 preempt_cursor;
 
 #include "util.bpf.c"
 
-
 UEI_DEFINE(uei);
 
 static inline bool vtime_before(u64 a, u64 b)
@@ -357,6 +356,10 @@ static bool refresh_cpumasks(int idx)
 	return total > 0;
 }
 
+// TODO: Refactor includes that have circular dependencies. This import must be
+// defined after some helpers, but before its helpers are used.
+#include "cost.bpf.c"
+
 SEC("fentry")
 int BPF_PROG(sched_tick_fentry)
 {
@@ -775,6 +778,7 @@ bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 		     struct task_ctx *tctx, struct layer *layer,
 		     bool preempt_first)
 {
+	struct cost *cost;
 	struct cpu_ctx *cand_cctx, *sib_cctx = NULL;
 	s32 sib;
 
@@ -784,6 +788,9 @@ bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 	if (!(cand_cctx = lookup_cpu_ctx(cand)) || cand_cctx->current_preempt)
 		return false;
 
+	if (!(cost = lookup_cpu_cost(cand)) || has_budget(cost, layer) == 0)
+		return false;
+
 	/*
 	 * If exclusive, we want to make sure the sibling CPU, if there's
 	 * one, is idle. However, if the sibling CPU is already running a
@@ -1206,11 +1213,12 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 {
 	struct cpu_ctx *cctx, *sib_cctx;
 	struct layer *layer;
+	struct cost *cost;
 	u64 dsq_id;
 	u32 idx, layer_idx;
 	s32 sib = sibling_cpu(cpu);
 
-	if (!(cctx = lookup_cpu_ctx(-1)))
+	if (!(cctx = lookup_cpu_ctx(-1)) || !(cost = lookup_cpu_cost(cpu)))
 		return;
 
 	/*
@@ -1241,12 +1249,14 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 
 	/* consume preempting layers first */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		if (layer->preempt && scx_bpf_consume(layer_idx))
 			return;
 	}
@@ -1257,12 +1267,14 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 
 	/* consume !open layers second */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		struct cpumask *layer_cpumask;
 
 		/* consume matching layers */
@@ -1278,12 +1290,14 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 
 	/* consume !preempting open layers */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		if (!layer->preempt && layers->open &&
 		    scx_bpf_consume(layer_idx))
 			return;
@@ -1299,11 +1313,13 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct cpu_ctx *cctx, *sib_cctx;
 	struct layer *layer;
+	struct cost *cost;
 	u64 dsq_id;
 	u32 idx, llc_idx, layer_idx;
 	s32 sib = sibling_cpu(cpu);
 
-	if (!(cctx = lookup_cpu_ctx(-1)))
+	if (!(cctx = lookup_cpu_ctx(-1)) ||
+	    !(cost = lookup_cpu_cost(cpu)))
 		return;
 
 	/*
@@ -1336,12 +1352,14 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume preempting layers first */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		bpf_for(llc_idx, 0, nr_llcs) {
 			u32 llc_id = rotate_llc_id(my_llc_id, llc_idx);
 			dsq_id = layer_dsq_id(layer_idx, llc_id);
@@ -1356,12 +1374,14 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume !open layers second */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		bpf_for(llc_idx, 0, nr_llcs) {
 			u32 llc_id = rotate_llc_id(my_llc_id, llc_idx);
 			struct cpumask *layer_cpumask;
@@ -1382,12 +1402,14 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume !preempting open layers */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = rotate_layer_id(cctx->layer_idx, idx);
+		layer_idx = rotate_layer_id(cost->pref_layer, idx);
 		if (layer_idx >= nr_layers) {
 			scx_bpf_error("can't happen");
 			return;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
+		if (has_budget(cost, layer) == 0)
+			continue;
 		bpf_for(llc_idx, 0, nr_llcs) {
 			u32 llc_id = rotate_llc_id(my_llc_id, llc_idx);
 			dsq_id = layer_dsq_id(layer_idx, llc_id);
@@ -1800,6 +1822,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
 	struct layer *layer;
+	struct cost *cost;
 	s32 lidx;
 	u64 used;
 
@@ -1807,7 +1830,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 		return;
 
 	lidx = tctx->layer;
-	if (!(layer = lookup_layer(lidx)))
+	if (!(layer = lookup_layer(lidx)) || !(cost = lookup_cpu_cost(-1)))
 		return;
 
 	used = bpf_ktime_get_ns() - tctx->running_at;
@@ -1817,6 +1840,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 		used = layer->min_exec_ns;
 	}
 
+	record_cpu_cost(cost, layer->idx, (s64)used);
 	cctx->layer_cycles[lidx] += used;
 	cctx->current_preempt = false;
 	cctx->prev_exclusive = cctx->current_exclusive;
@@ -2286,7 +2310,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 			}
 		}
 	}
-	start_layered_timers();
+	initialize_budgets(1000LLU * NSEC_PER_MSEC);
+	ret = start_layered_timers();
+	if (ret < 0)
+		return ret;
 
 	return 0;
 }
diff --git a/scheds/rust/scx_layered/src/bpf/timer.bpf.h b/scheds/rust/scx_layered/src/bpf/timer.bpf.h
index 1395ea9a..726f26b3 100644
--- a/scheds/rust/scx_layered/src/bpf/timer.bpf.h
+++ b/scheds/rust/scx_layered/src/bpf/timer.bpf.h
@@ -20,7 +20,7 @@ struct layered_timer {
 	// if set to 0 the timer will only be scheduled once
 	u64 interval_ns;
 	u64 init_flags;
-	u64 start_flags;
+	int start_flags;
 };
 
 enum layer_timer_callbacks {
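
Note (reviewer addition, illustrative only): the global budget that
initialize_budgets() assigns to each layer is that layer's weight share of
the total CPU time available in one refresh interval
(refresh_intvl_ns * nr_possible_cpus). The sketch below is a plain
userspace C program, not part of the patch, that reproduces only that
arithmetic; the layer weights, CPU count, and interval are made-up example
values.

#include <stdint.h>
#include <stdio.h>

#define NR_LAYERS 3

int main(void)
{
	/* 1s refresh interval, matching what layered_init() passes in the patch */
	uint64_t refresh_intvl_ns = 1000ULL * 1000 * 1000;
	uint64_t nr_possible_cpus = 8;				/* example machine */
	uint64_t weights[NR_LAYERS] = { 100, 200, 700 };	/* example layer weights */
	uint64_t weight_sum = 0;

	for (int i = 0; i < NR_LAYERS; i++)
		weight_sum += weights[i];

	for (int i = 0; i < NR_LAYERS; i++) {
		/* same shape as the global layer_weight_dur in initialize_budgets() */
		uint64_t budget = (weights[i] * (refresh_intvl_ns * nr_possible_cpus)) /
				  weight_sum;
		printf("layer %d: budget %llu ns\n", i, (unsigned long long)budget);
	}
	return 0;
}

With these example values the three layers get 0.8s, 1.6s, and 5.6s of CPU
time per 1s interval across the 8 CPUs, i.e. budgets proportional to the
100:200:700 weights.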
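
Note (reviewer addition, illustrative only): at runtime, layered_stopping()
charges the used slice to the per-CPU budget via record_cpu_cost(); when the
per-CPU budget is exhausted it pulls from the global (parent) budget via
acquire_budget(), and when a layer's global budget goes negative everything
is reset by refresh_budgets(). The toy program below models that interaction
for a single CPU and a single layer; it is a simplified, non-atomic sketch
with invented numbers, not the patch's code.

#include <stdint.h>
#include <stdio.h>

struct toy_cost {
	int64_t budget;
	int64_t capacity;
};

static struct toy_cost global_cost = { .budget = 8000, .capacity = 8000 };
static struct toy_cost cpu_cost    = { .budget = 1000, .capacity = 1000 };

/* roughly refresh_budgets(): reset the budget back to capacity */
static void refresh_all(void)
{
	global_cost.budget = global_cost.capacity;
}

/* roughly record_cpu_cost() + acquire_budget() for one layer */
static void charge(int64_t used_ns)
{
	cpu_cost.budget -= used_ns;
	if (cpu_cost.budget <= 0) {
		/* pull one capacity plus the overrun from the parent budget */
		global_cost.budget -= cpu_cost.capacity + used_ns;
		if (global_cost.budget < 0)
			refresh_all();
		/* refill the CPU-local budget by one capacity */
		cpu_cost.budget += cpu_cost.capacity;
	}
}

int main(void)
{
	for (int i = 0; i < 12; i++) {
		charge(400);	/* pretend a task ran for 400ns */
		printf("tick %2d: cpu budget %5lld, global budget %5lld\n",
		       i, (long long)cpu_cost.budget,
		       (long long)global_cost.budget);
	}
	return 0;
}

The add-back of exactly one capacity per acquisition mirrors
record_cpu_cost(), which __sync_fetch_and_add()s costc->capacity[layer_id]
back into the per-CPU budget after a successful acquire.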