scx_fair: use blocker/waker metrics to determine the latency weight
Signed-off-by: Andrea Righi <[email protected]>
arighi committed Nov 1, 2024
1 parent 07de4dd commit e07a97a
Showing 3 changed files with 137 additions and 108 deletions.
238 changes: 136 additions & 102 deletions scheds/rust/scx_fair/src/bpf/main.bpf.c
@@ -37,8 +37,7 @@ const volatile bool local_kthreads;
/*
* Scheduling statistics.
*/
volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
nr_shared_dispatches, nr_migrate_dispatches;
volatile u64 nr_kthread_dispatches, nr_direct_dispatches, nr_shared_dispatches;

/*
* Exit information.
@@ -103,15 +102,21 @@ struct task_ctx {
u64 sum_exec_runtime;

/*
* Voluntary context switches metrics.
* Average time slice used by the task.
*/
u64 nvcsw;
u64 nvcsw_ts;
u64 avg_runtime;

/*
* Task's dynamic priority multiplier.
* Frequency with which a task is blocked (consumer).
*/
u64 lat_weight;
u64 blocked_freq;
u64 last_blocked_at;

/*
* Frequency with which a task wakes other tasks (producer).
*/
u64 waker_freq;
u64 last_woke_at;
};

/* Map that contains task-local storage. */
@@ -181,57 +186,65 @@ static inline bool is_kthread(const struct task_struct *p)
}

/*
* Return the dynamic priority multiplier.
* Return the number of tasks that are waiting to run.
*/
static inline u64 nr_waiting_tasks(void)
{
return scx_bpf_dsq_nr_queued(SHARED_DSQ) + 1;
}

/*
* Return the dynamic priority multiplier (only applied in lowlatency mode).
*
* The multiplier is evaluated as a function of the task's average rate of
* voluntary context switches per second.
* The dynamic priority is evaluated as a function of the task's blocker and
* waker frequencies.
*/
static u64 task_dyn_prio(struct task_struct *p)
static u64 task_latency_weight(const struct task_struct *p,
const struct task_ctx *tctx)
{
struct task_ctx *tctx;
/*
* Prioritize producers (waker_freq) more than consumers
* (blocked_freq).
*/
u64 lat_weight = calc_avg(tctx->waker_freq, tctx->blocked_freq);

tctx = try_lookup_task_ctx(p);
if (!tctx)
return 1;
return MAX(tctx->lat_weight, 1);
return CLAMP(lat_weight, 1, MAX_LATENCY_WEIGHT);
}
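
For reference, a minimal userspace sketch of how the blended weight behaves. calc_avg() and MAX_LATENCY_WEIGHT are defined elsewhere in the scheduler; the stand-ins below (an exponential moving average with 1/4 weight and a cap of 1000) are assumptions, not taken from this commit. Blending with calc_avg(waker_freq, blocked_freq) keeps 3/4 of the waker frequency and mixes in 1/4 of the blocked frequency, which is what gives producers the edge over consumers:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define MAX_LATENCY_WEIGHT	1000	/* assumed cap, defined outside this diff */
#define CLAMP(v, lo, hi)	((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

/* Assumed EWMA helper: keep 3/4 of the old value, blend in 1/4 of the new one. */
static u64 calc_avg(u64 old_val, u64 new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* Same shape as task_latency_weight(): blend producer and consumer rates, then clamp. */
static u64 latency_weight(u64 waker_freq, u64 blocked_freq)
{
	u64 w = calc_avg(waker_freq, blocked_freq);

	return CLAMP(w, 1, MAX_LATENCY_WEIGHT);
}

int main(void)
{
	/* A pipeline stage that wakes peers ~200 times/s and blocks ~50 times/s. */
	printf("latency weight: %llu\n",
	       (unsigned long long)latency_weight(200, 50));
	return 0;
}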

/*
* Return task's dynamic priority.
*/
static u64 task_prio(struct task_struct *p)
static u64 task_prio(const struct task_struct *p, const struct task_ctx *tctx)
{
return p->scx.weight * task_dyn_prio(p);
u64 prio = p->scx.weight * task_latency_weight(p, tctx);

return CLAMP(prio, 1, 10000);
}

/*
* Return the task's allowed lag: used to determine how early its deadline
* can be.
* Return a value inversely proportional to the task's priority.
*/
static u64 task_lag(struct task_struct *p)
static u64 scale_inverse_fair(const struct task_struct *p,
const struct task_ctx *tctx, u64 value)
{
return MIN(slice_lag * task_prio(p) / 100, NSEC_PER_SEC);
return value * 100 / task_prio(p, tctx);
}

/*
* Return a value inversely proportional to the task's weight.
* Evaluate the task's deadline component.
*/
static inline u64 scale_inverse_fair(struct task_struct *p, u64 value)
static u64 task_compute_dl(const struct task_struct *p,
const struct task_ctx *tctx)
{
return value * 100 / task_prio(p, tctx);
return scale_inverse_fair(p, tctx, tctx->avg_runtime);
}
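
To make the deadline component concrete, here is a small sketch with illustrative numbers (none of them come from the commit). Since prio is p->scx.weight multiplied by the latency weight (clamped to [1, 10000]), the offset added on top of the task's vruntime shrinks as the priority grows, so latency-sensitive tasks are picked from the shared DSQ earlier:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* Same arithmetic as scale_inverse_fair() applied to avg_runtime. */
static u64 deadline_component(u64 avg_runtime, u64 prio)
{
	return avg_runtime * 100 / prio;
}

int main(void)
{
	u64 avg_runtime = 2000000;	/* 2 ms of average used time slice */

	/* Default-weight CPU hog vs. a latency-boosted task: */
	printf("prio 100:  +%llu ns\n",
	       (unsigned long long)deadline_component(avg_runtime, 100));
	printf("prio 1000: +%llu ns\n",
	       (unsigned long long)deadline_component(avg_runtime, 1000));
	return 0;
}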

/*
* Return task's evaluated deadline.
*/
static inline u64 task_vtime(struct task_struct *p)
static u64 task_deadline(struct task_struct *p, const struct task_ctx *tctx)
{
u64 min_vruntime = vtime_now - task_lag(p);
struct task_ctx *tctx;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return min_vruntime;
u64 min_vruntime = vtime_now - slice_lag;

/*
* Limit the vruntime to vtime_now minus the maximum task's lag to
@@ -240,24 +253,18 @@ static inline u64 task_vtime(struct task_struct *p)
if (vtime_before(p->scx.dsq_vtime, min_vruntime))
p->scx.dsq_vtime = min_vruntime;

return p->scx.dsq_vtime;
}

static inline u64 nr_waiting_tasks(void)
{
return scx_bpf_dsq_nr_queued(SHARED_DSQ);
return p->scx.dsq_vtime + task_compute_dl(p, tctx);
}

/*
* Evaluate the task's time slice as a function of the total number of tasks
* that are waiting to be dispatched and the task's weight.
*/
static inline void task_refill_slice(struct task_struct *p)
static void task_refill_slice(struct task_struct *p)
{
u64 slice, nr_waiting = nr_waiting_tasks();
u64 nr_waiting = nr_waiting_tasks();

slice = slice_max / (nr_waiting + 1);
p->scx.slice = CLAMP(slice, slice_min, slice_max);
p->scx.slice = CLAMP(slice_max / nr_waiting, slice_min, slice_max);
}
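
As a rough illustration of how the slice shrinks with contention, the sketch below uses assumed defaults for slice_min and slice_max (they are tunables of the scheduler, not values taken from this diff). nr_waiting already includes the +1 added by nr_waiting_tasks(), so a fully idle system gets the full slice_max:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define CLAMP(v, lo, hi)	((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

static const u64 slice_min = 500000ULL;		/* 0.5 ms, assumed default */
static const u64 slice_max = 20000000ULL;	/* 20 ms, assumed default */

/* Same shape as task_refill_slice(). */
static u64 refill_slice(u64 nr_waiting)
{
	return CLAMP(slice_max / nr_waiting, slice_min, slice_max);
}

int main(void)
{
	printf("idle system:      %llu ns\n", (unsigned long long)refill_slice(1));
	printf("4 queued tasks:   %llu ns\n", (unsigned long long)refill_slice(5));
	printf("100 queued tasks: %llu ns\n", (unsigned long long)refill_slice(101));
	return 0;
}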

/*
@@ -470,27 +477,42 @@ s32 BPF_STRUCT_OPS(fair_select_cpu, struct task_struct *p,
return cpu;
}

/*
* If we couldn't find any idle CPU, return any CPU in the task's
* domain.
*
* There is no guarantee that the task will be dispatched on this CPU,
* but it can help to wake up in advance a CPU suitable for the task.
*/
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

return cpu >= 0 ? cpu : prev_cpu;
return prev_cpu;
}

/*
* Wake up an idle CPU for task @p.
*/
static void kick_task_cpu(struct task_struct *p)
{
s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
s32 cpu = scx_bpf_task_cpu(p);

/*
* If the task isn't allowed to use its previously used CPU, try to pick
* any random idle CPU in its new allowed CPU domain.
*/
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0)
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
return;
}

/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is idle.
*/
if (p->nr_cpus_allowed == 1 || p->migration_disabled) {
if (scx_bpf_test_and_clear_cpu_idle(cpu))
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

/*
* Otherwise, use the regular idle CPU selection logic.
*/
cpu = pick_idle_cpu(p, cpu, 0);
if (cpu >= 0)
scx_bpf_kick_cpu(cpu, 0);
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

/*
Expand All @@ -499,7 +521,11 @@ static void kick_task_cpu(struct task_struct *p)
*/
void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags)
{
s32 cpu;
struct task_ctx *tctx;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return;

/*
* Per-CPU kthreads can be critical for system responsiveness, when
@@ -508,7 +534,7 @@ void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags)
*/
if (local_kthreads && is_kthread(p) && p->nr_cpus_allowed == 1) {
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
enq_flags);
enq_flags | SCX_ENQ_PREEMPT);
__sync_fetch_and_add(&nr_kthread_dispatches, 1);
return;
}
@@ -518,7 +544,7 @@ void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags)
* the first CPU that becomes available.
*/
scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL,
task_vtime(p), enq_flags);
task_deadline(p, tctx), enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);

/*
Expand Down Expand Up @@ -590,6 +616,12 @@ void BPF_STRUCT_OPS(fair_running, struct task_struct *p)
* Adjust target CPU frequency before the task starts to run.
*/
update_cpuperf_target(p);

/*
* Update global vruntime.
*/
if (vtime_before(vtime_now, p->scx.dsq_vtime))
vtime_now = p->scx.dsq_vtime;
}

/*
@@ -598,78 +630,76 @@ void BPF_STRUCT_OPS(fair_running, struct task_struct *p)
*/
void BPF_STRUCT_OPS(fair_stopping, struct task_struct *p, bool runnable)
{
u64 now = bpf_ktime_get_ns(), slice, delta_t;
u64 now = bpf_ktime_get_ns(), slice;
s32 cpu = scx_bpf_task_cpu(p);
struct cpu_ctx *cctx;
struct task_ctx *tctx;

cctx = try_lookup_cpu_ctx(cpu);
if (cctx)
cctx->tot_runtime += now - cctx->last_running;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return;

/*
* If the time slice is not fully depleted, it means that the task
* voluntarily released the CPU, therefore update the voluntary context
* switch counter.
*
* NOTE: the sched_ext core implements sched_yield() by setting the
* time slice to 0, so we won't boost the priority of tasks that are
* explicitly calling sched_yield().
*
* This is actually a good thing, because we want to prioritize tasks
* that are releasing the CPU, because they're doing I/O, waiting for
* input or sending output to other tasks.
*
* Tasks that are using sched_yield() don't really need the priority
* boost and when they get the chance to run again they will be
* naturally prioritized by the vruntime-based scheduling policy.
*/
if (p->scx.slice > 0)
tctx->nvcsw++;
cctx = try_lookup_cpu_ctx(cpu);
if (cctx)
cctx->tot_runtime += now - cctx->last_running;

/*
* Evaluate task's used time slice.
*/
slice = CLAMP(p->se.sum_exec_runtime - tctx->sum_exec_runtime,
slice_min, slice_max);
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
slice = scale_inverse_fair(p, slice);
slice = scale_inverse_fair(p, tctx, slice);
tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice);

/*
* Update task's vruntime by adding the used time slice, scaled by its
* priority.
*/
p->scx.dsq_vtime += slice;
}

/*
* Update global system vruntime.
*/
vtime_now += slice;
static u64 update_freq(u64 freq, u64 delta)
{
u64 new_freq;

/*
* Update task's average rate of voluntary context switches per second.
*/
delta_t = (s64)(now - tctx->nvcsw_ts);
if (delta_t > NSEC_PER_SEC) {
/*
* Evaluate the task's latency weight as the task's average
* rate of voluntary context switches per second.
*/
u64 avg_nvcsw = tctx->nvcsw * NSEC_PER_SEC / delta_t;
u64 lat_weight = MIN(avg_nvcsw, MAX_LATENCY_WEIGHT);
new_freq = NSEC_PER_SEC / delta;
return calc_avg(freq, new_freq);
}

tctx->nvcsw = 0;
tctx->nvcsw_ts = now;
tctx->lat_weight = calc_avg(tctx->lat_weight, lat_weight);
}
void BPF_STRUCT_OPS(fair_runnable, struct task_struct *p, u64 enq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_struct *waker;
struct task_ctx *tctx;

waker = bpf_get_current_task_btf();
tctx = try_lookup_task_ctx(waker);
if (!tctx)
return;

delta = MAX(now - tctx->last_woke_at, 1);
tctx->waker_freq = update_freq(tctx->waker_freq, delta);
tctx->last_woke_at = now;
}

void BPF_STRUCT_OPS(fair_quiescent, struct task_struct *p, u64 deq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_ctx *tctx;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return;

delta = MAX(now - tctx->last_blocked_at, 1);
tctx->blocked_freq = update_freq(tctx->blocked_freq, delta);
tctx->last_blocked_at = now;
}
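
The runnable/quiescent pair above is what feeds the waker and blocker frequencies: on every event, the elapsed time since the previous one is turned into an events-per-second rate and smoothed. Below is a minimal userspace sketch of that pattern; calc_avg() is assumed to be the scheduler's exponential moving average helper (defined outside this diff):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define NSEC_PER_SEC	1000000000ULL
#define MAX(a, b)	((a) > (b) ? (a) : (b))

/* Assumed EWMA helper, defined elsewhere in the scheduler. */
static u64 calc_avg(u64 old_val, u64 new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* Same shape as update_freq(): time since the last event -> events/s, smoothed. */
static u64 update_freq(u64 freq, u64 delta)
{
	return calc_avg(freq, NSEC_PER_SEC / delta);
}

int main(void)
{
	u64 blocked_freq = 0, last_blocked_at = 0, now;

	/* Simulate a task that blocks every 10 ms: blocked_freq converges
	 * towards ~100 events/s. */
	for (now = 10000000ULL; now <= 100000000ULL; now += 10000000ULL) {
		u64 delta = MAX(now - last_blocked_at, 1);

		blocked_freq = update_freq(blocked_freq, delta);
		last_blocked_at = now;
	}
	printf("blocked_freq after 10 events: %llu\n",
	       (unsigned long long)blocked_freq);
	return 0;
}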

void BPF_STRUCT_OPS(fair_enable, struct task_struct *p)
{
u64 now = bpf_ktime_get_ns();
struct task_ctx *tctx;

p->scx.dsq_vtime = vtime_now;
@@ -681,7 +711,8 @@ void BPF_STRUCT_OPS(fair_enable, struct task_struct *p)
return;
}
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->nvcsw_ts = bpf_ktime_get_ns();
tctx->last_woke_at = now;
tctx->last_blocked_at = now;
}

s32 BPF_STRUCT_OPS(fair_init_task, struct task_struct *p,
@@ -799,9 +830,12 @@ SCX_OPS_DEFINE(fair_ops,
.dispatch = (void *)fair_dispatch,
.running = (void *)fair_running,
.stopping = (void *)fair_stopping,
.runnable = (void *)fair_runnable,
.quiescent = (void *)fair_quiescent,
.enable = (void *)fair_enable,
.init_task = (void *)fair_init_task,
.init = (void *)fair_init,
.exit = (void *)fair_exit,
.flags = SCX_OPS_ENQ_EXITING,
.timeout_ms = 5000,
.name = "fair");