From e07a97a213b46177c9ba918c56931d6501e773b5 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 30 Oct 2024 19:32:17 +0100 Subject: [PATCH] scx_fair: use blocker/waker metrics to determine the latency weight Signed-off-by: Andrea Righi --- scheds/rust/scx_fair/src/bpf/main.bpf.c | 238 ++++++++++++++---------- scheds/rust/scx_fair/src/main.rs | 1 - scheds/rust/scx_fair/src/stats.rs | 6 +- 3 files changed, 137 insertions(+), 108 deletions(-) diff --git a/scheds/rust/scx_fair/src/bpf/main.bpf.c b/scheds/rust/scx_fair/src/bpf/main.bpf.c index 0ef86cf53..a864bbaaf 100644 --- a/scheds/rust/scx_fair/src/bpf/main.bpf.c +++ b/scheds/rust/scx_fair/src/bpf/main.bpf.c @@ -37,8 +37,7 @@ const volatile bool local_kthreads; /* * Scheduling statistics. */ -volatile u64 nr_kthread_dispatches, nr_direct_dispatches, - nr_shared_dispatches, nr_migrate_dispatches; +volatile u64 nr_kthread_dispatches, nr_direct_dispatches, nr_shared_dispatches; /* * Exit information. @@ -103,15 +102,21 @@ struct task_ctx { u64 sum_exec_runtime; /* - * Voluntary context switches metrics. + * Average time slice used by the task. */ - u64 nvcsw; - u64 nvcsw_ts; + u64 avg_runtime; /* - * Task's dynamic priority multiplier. + * Frequency with which a task is blocked (consumer). */ - u64 lat_weight; + u64 blocked_freq; + u64 last_blocked_at; + + /* + * Frequency with which a task wakes other tasks (producer). + */ + u64 waker_freq; + u64 last_woke_at; }; /* Map that contains task-local storage. */ @@ -181,57 +186,65 @@ static inline bool is_kthread(const struct task_struct *p) } /* - * Return the dynamic priority multiplier. + * Return the amount of tasks that are waiting to run. + */ +static inline u64 nr_waiting_tasks(void) +{ + return scx_bpf_dsq_nr_queued(SHARED_DSQ) + 1; +} + +/* + * Return the dynamic priority multiplier (only applied in lowlatency mode). * - * The multiplier is evaluated in function of the task's average rate of - * voluntary context switches per second. + * The dynamic priority is evaluated in function of the task's blocker and + * waker frequencies. */ -static u64 task_dyn_prio(struct task_struct *p) +static u64 task_latency_weight(const struct task_struct *p, + const struct task_ctx *tctx) { - struct task_ctx *tctx; + /* + * Prioritize producers (waker_freq) more than consumers + * (blocked_freq). + */ + u64 lat_weight = calc_avg(tctx->waker_freq, tctx->blocked_freq); - tctx = try_lookup_task_ctx(p); - if (!tctx) - return 1; - return MAX(tctx->lat_weight, 1); + return CLAMP(lat_weight, 1, MAX_LATENCY_WEIGHT); } /* * Return task's dynamic priority. */ -static u64 task_prio(struct task_struct *p) +static u64 task_prio(const struct task_struct *p, const struct task_ctx *tctx) { - return p->scx.weight * task_dyn_prio(p); + u64 prio = p->scx.weight * task_latency_weight(p, tctx); + + return CLAMP(prio, 1, 10000); } /* - * Return the task's allowed lag: used to determine how early its deadline it - * can be. + * Return a value inversely proportional to the task's priority. */ -static u64 task_lag(struct task_struct *p) +static u64 scale_inverse_fair(const struct task_struct *p, + const struct task_ctx *tctx, u64 value) { - return MIN(slice_lag * task_prio(p) / 100, NSEC_PER_SEC); + return value * 100 / task_prio(p, tctx); } /* - * Return a value inversely proportional to the task's weight. + * Evaluate the task's deadline component. 
*/ -static inline u64 scale_inverse_fair(struct task_struct *p, u64 value) +static u64 task_compute_dl(const struct task_struct *p, + const struct task_ctx *tctx) { - return value * 100 / task_prio(p, tctx); + return scale_inverse_fair(p, tctx, tctx->avg_runtime); } /* * Return task's evaluated deadline. */ -static inline u64 task_vtime(struct task_struct *p) +static u64 task_deadline(struct task_struct *p, const struct task_ctx *tctx) { - u64 min_vruntime = vtime_now - task_lag(p); - struct task_ctx *tctx; - - tctx = try_lookup_task_ctx(p); - if (!tctx) - return min_vruntime; + u64 min_vruntime = vtime_now - slice_lag; /* * Limit the vruntime to vtime_now minus the maximum task's lag to @@ -240,24 +253,18 @@ static inline u64 task_vtime(struct task_struct *p) if (vtime_before(p->scx.dsq_vtime, min_vruntime)) p->scx.dsq_vtime = min_vruntime; - return p->scx.dsq_vtime; -} - -static inline u64 nr_waiting_tasks(void) -{ - return scx_bpf_dsq_nr_queued(SHARED_DSQ); + return p->scx.dsq_vtime + task_compute_dl(p, tctx); } /* * Evaluate task's time slice in function of the total amount of tasks that are * waiting to be dispatched and the task's weight. */ -static inline void task_refill_slice(struct task_struct *p) +static void task_refill_slice(struct task_struct *p) { - u64 slice, nr_waiting = nr_waiting_tasks(); + u64 nr_waiting = nr_waiting_tasks(); - slice = slice_max / (nr_waiting + 1); - p->scx.slice = CLAMP(slice, slice_min, slice_max); + p->scx.slice = CLAMP(slice_max / nr_waiting, slice_min, slice_max); } /* @@ -470,16 +477,7 @@ s32 BPF_STRUCT_OPS(fair_select_cpu, struct task_struct *p, return cpu; } - /* - * If we couldn't find any idle CPU, return any CPU in the task's - * domain. - * - * There is not guarantee that the task will be dispatched on this CPU, - * but it can help to wake-up in advance a CPU suitable for the task. - */ - cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); - - return cpu >= 0 ? cpu : prev_cpu; + return prev_cpu; } /* @@ -487,10 +485,34 @@ s32 BPF_STRUCT_OPS(fair_select_cpu, struct task_struct *p, */ static void kick_task_cpu(struct task_struct *p) { - s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + s32 cpu = scx_bpf_task_cpu(p); + + /* + * If the task isn't allowed to use its previously used CPU try to pick + * any random idle CPU in its new allowed CPU domain. + */ + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + return; + } + + /* + * For tasks that can run only on a single CPU, we can simply verify if + * their only allowed CPU is idle. + */ + if (p->nr_cpus_allowed == 1 || p->migration_disabled) { + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + } + /* + * Otherwise, use the regular idle CPU selection logic. 
+ */ + cpu = pick_idle_cpu(p, cpu, 0); if (cpu >= 0) - scx_bpf_kick_cpu(cpu, 0); + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); } /* @@ -499,7 +521,11 @@ static void kick_task_cpu(struct task_struct *p) */ void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags) { - s32 cpu; + struct task_ctx *tctx; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; /* * Per-CPU kthreads can be critical for system responsiveness, when @@ -508,7 +534,7 @@ void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags) */ if (local_kthreads && is_kthread(p) && p->nr_cpus_allowed == 1) { scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, - enq_flags); + enq_flags | SCX_ENQ_PREEMPT); __sync_fetch_and_add(&nr_kthread_dispatches, 1); return; } @@ -518,7 +544,7 @@ void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags) * the first CPU that becomes available. */ scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, - task_vtime(p), enq_flags); + task_deadline(p, tctx), enq_flags); __sync_fetch_and_add(&nr_shared_dispatches, 1); /* @@ -590,6 +616,12 @@ void BPF_STRUCT_OPS(fair_running, struct task_struct *p) * Adjust target CPU frequency before the task starts to run. */ update_cpuperf_target(p); + + /* + * Update global vruntime. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; } /* @@ -598,38 +630,18 @@ void BPF_STRUCT_OPS(fair_running, struct task_struct *p) */ void BPF_STRUCT_OPS(fair_stopping, struct task_struct *p, bool runnable) { - u64 now = bpf_ktime_get_ns(), slice, delta_t; + u64 now = bpf_ktime_get_ns(), slice; s32 cpu = scx_bpf_task_cpu(p); struct cpu_ctx *cctx; struct task_ctx *tctx; - cctx = try_lookup_cpu_ctx(cpu); - if (cctx) - cctx->tot_runtime += now - cctx->last_running; - tctx = try_lookup_task_ctx(p); if (!tctx) return; - /* - * If the time slice is not fully depleted, it means that the task - * voluntarily relased the CPU, therefore update the voluntary context - * switch counter. - * - * NOTE: the sched_ext core implements sched_yield() by setting the - * time slice to 0, so we won't boost the priority of tasks that are - * explicitly calling sched_yield(). - * - * This is actually a good thing, because we want to prioritize tasks - * that are releasing the CPU, because they're doing I/O, waiting for - * input or sending output to other tasks. - * - * Tasks that are using sched_yield() don't really need the priority - * boost and when they get the chance to run again they will be - * naturally prioritized by the vruntime-based scheduling policy. - */ - if (p->scx.slice > 0) - tctx->nvcsw++; + cctx = try_lookup_cpu_ctx(cpu); + if (cctx) + cctx->tot_runtime += now - cctx->last_running; /* * Evaluate task's used time slice. @@ -637,39 +649,57 @@ void BPF_STRUCT_OPS(fair_stopping, struct task_struct *p, bool runnable) slice = CLAMP(p->se.sum_exec_runtime - tctx->sum_exec_runtime, slice_min, slice_max); tctx->sum_exec_runtime = p->se.sum_exec_runtime; - slice = scale_inverse_fair(p, slice); + slice = scale_inverse_fair(p, tctx, slice); + tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice); /* * Update task's vruntime by adding the used time slice, scaled by its * priority. */ p->scx.dsq_vtime += slice; +} - /* - * Update global system vruntime. - */ - vtime_now += slice; +static u64 update_freq(u64 freq, u64 delta) +{ + u64 new_freq; - /* - * Update task's average rate of voluntary context switches per second. 
- */ - delta_t = (s64)(now - tctx->nvcsw_ts); - if (delta_t > NSEC_PER_SEC) { - /* - * Evaluate the task's latency weight as the task's average - * rate of voluntary context switches per second. - */ - u64 avg_nvcsw = tctx->nvcsw * NSEC_PER_SEC / delta_t; - u64 lat_weight = MIN(avg_nvcsw, MAX_LATENCY_WEIGHT); + new_freq = NSEC_PER_SEC / delta; + return calc_avg(freq, new_freq); +} - tctx->nvcsw = 0; - tctx->nvcsw_ts = now; - tctx->lat_weight = calc_avg(tctx->lat_weight, lat_weight); - } +void BPF_STRUCT_OPS(fair_runnable, struct task_struct *p, u64 enq_flags) +{ + u64 now = bpf_ktime_get_ns(), delta; + struct task_struct *waker; + struct task_ctx *tctx; + + waker = bpf_get_current_task_btf(); + tctx = try_lookup_task_ctx(waker); + if (!tctx) + return; + + delta = MAX(now - tctx->last_woke_at, 1); + tctx->waker_freq = update_freq(tctx->waker_freq, delta); + tctx->last_woke_at = now; +} + +void BPF_STRUCT_OPS(fair_quiescent, struct task_struct *p, u64 deq_flags) +{ + u64 now = bpf_ktime_get_ns(), delta; + struct task_ctx *tctx; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + + delta = MAX(now - tctx->last_blocked_at, 1); + tctx->blocked_freq = update_freq(tctx->blocked_freq, delta); + tctx->last_blocked_at = now; } void BPF_STRUCT_OPS(fair_enable, struct task_struct *p) { + u64 now = bpf_ktime_get_ns(); struct task_ctx *tctx; p->scx.dsq_vtime = vtime_now; @@ -681,7 +711,8 @@ void BPF_STRUCT_OPS(fair_enable, struct task_struct *p) return; } tctx->sum_exec_runtime = p->se.sum_exec_runtime; - tctx->nvcsw_ts = bpf_ktime_get_ns(); + tctx->last_woke_at = now; + tctx->last_blocked_at = now; } s32 BPF_STRUCT_OPS(fair_init_task, struct task_struct *p, @@ -799,9 +830,12 @@ SCX_OPS_DEFINE(fair_ops, .dispatch = (void *)fair_dispatch, .running = (void *)fair_running, .stopping = (void *)fair_stopping, + .runnable = (void *)fair_runnable, + .quiescent = (void *)fair_quiescent, .enable = (void *)fair_enable, .init_task = (void *)fair_init_task, .init = (void *)fair_init, .exit = (void *)fair_exit, + .flags = SCX_OPS_ENQ_EXITING, .timeout_ms = 5000, .name = "fair"); diff --git a/scheds/rust/scx_fair/src/main.rs b/scheds/rust/scx_fair/src/main.rs index cc94d801e..4000089b6 100644 --- a/scheds/rust/scx_fair/src/main.rs +++ b/scheds/rust/scx_fair/src/main.rs @@ -252,7 +252,6 @@ impl<'a> Scheduler<'a> { nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches, nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches, nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches, - nr_migrate_dispatches: self.skel.maps.bss_data.nr_migrate_dispatches, } } diff --git a/scheds/rust/scx_fair/src/stats.rs b/scheds/rust/scx_fair/src/stats.rs index 543b14c4b..a590ab2f7 100644 --- a/scheds/rust/scx_fair/src/stats.rs +++ b/scheds/rust/scx_fair/src/stats.rs @@ -19,20 +19,17 @@ pub struct Metrics { pub nr_direct_dispatches: u64, #[stat(desc = "Number of task global dispatches")] pub nr_shared_dispatches: u64, - #[stat(desc = "Number of migrated task dispatches")] - pub nr_migrate_dispatches: u64, } impl Metrics { fn format(&self, w: &mut W) -> Result<()> { writeln!( w, - "[{}] dispatch -> kthread: {:<5} direct: {:<5} shared: {:<5} migrated: {:<5}", + "[{}] dispatch -> kthread: {:<5} direct: {:<5} shared: {:<5}", crate::SCHEDULER_NAME, self.nr_kthread_dispatches, self.nr_direct_dispatches, self.nr_shared_dispatches, - self.nr_migrate_dispatches )?; Ok(()) } @@ -42,7 +39,6 @@ impl Metrics { nr_kthread_dispatches: self.nr_kthread_dispatches - rhs.nr_kthread_dispatches, 
nr_direct_dispatches: self.nr_direct_dispatches - rhs.nr_direct_dispatches, nr_shared_dispatches: self.nr_shared_dispatches - rhs.nr_shared_dispatches, - nr_migrate_dispatches: self.nr_migrate_dispatches - rhs.nr_migrate_dispatches, ..self.clone() } }
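
Note: the hunks above call calc_avg() and reference MAX_LATENCY_WEIGHT, both of which are defined outside this patch. The standalone userspace sketch below reproduces the new deadline math under two stated assumptions: calc_avg() is the "3/4 old + 1/4 new" integer moving average used elsewhere in the scx tree, and MAX_LATENCY_WEIGHT is 1000. Treat those two values as illustrative, not as facts taken from this diff.

/*
 * Standalone sketch of the deadline math introduced by this patch.
 * ASSUMPTIONS (not part of the diff): calc_avg() decays 1/4 toward the
 * new sample, and MAX_LATENCY_WEIGHT is 1000.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC		1000000000ULL
#define MAX_LATENCY_WEIGHT	1000ULL		/* assumed value */

#define MIN(a, b)		((a) < (b) ? (a) : (b))
#define MAX(a, b)		((a) > (b) ? (a) : (b))
#define CLAMP(v, lo, hi)	MAX((lo), MIN((v), (hi)))

/* Assumed moving average: 3/4 of the old value plus 1/4 of the new sample. */
static uint64_t calc_avg(uint64_t old_val, uint64_t new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* Mirror of update_freq(): track events per second as a moving average. */
static uint64_t update_freq(uint64_t freq, uint64_t delta_ns)
{
	if (!delta_ns)
		delta_ns = 1;	/* the BPF side clamps delta with MAX(delta, 1) */
	return calc_avg(freq, NSEC_PER_SEC / delta_ns);
}

/* Mirror of task_latency_weight() + task_prio() + task_compute_dl(). */
static uint64_t task_compute_dl(uint64_t weight, uint64_t waker_freq,
				uint64_t blocked_freq, uint64_t avg_runtime)
{
	uint64_t lat_weight = calc_avg(waker_freq, blocked_freq);
	uint64_t prio;

	lat_weight = CLAMP(lat_weight, 1, MAX_LATENCY_WEIGHT);
	prio = CLAMP(weight * lat_weight, 1, 10000);

	/* scale_inverse_fair(): higher priority yields a smaller offset. */
	return avg_runtime * 100 / prio;
}

int main(void)
{
	uint64_t waker_freq = 0;
	int i;

	/* A task waking someone every 20 ms converges toward ~50 Hz
	 * (48 with this integer moving average). */
	for (i = 0; i < 32; i++)
		waker_freq = update_freq(waker_freq, 20ULL * 1000 * 1000);
	printf("waker_freq ~ %llu Hz\n", (unsigned long long)waker_freq);

	/* A frequent waker gets a much smaller deadline offset than a
	 * CPU hog with the same weight and average runtime (2 ms). */
	printf("producer dl offset: %llu ns\n",
	       (unsigned long long)task_compute_dl(100, 50, 10, 2000000));
	printf("cpu-hog  dl offset: %llu ns\n",
	       (unsigned long long)task_compute_dl(100, 0, 0, 2000000));
	return 0;
}

Under the assumed calc_avg(), the in-patch comment "Prioritize producers (waker_freq) more than consumers (blocked_freq)" follows directly from the argument order: waker_freq contributes 3/4 of the latency weight and blocked_freq only 1/4.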