scx_fair: use blocker/waker metrics to determine the latency weight
Signed-off-by: Andrea Righi <[email protected]>
arighi committed Nov 1, 2024
1 parent 07de4dd commit e07a97a
Showing 3 changed files with 137 additions and 108 deletions.
238 changes: 136 additions & 102 deletions scheds/rust/scx_fair/src/bpf/main.bpf.c
@@ -37,8 +37,7 @@ const volatile bool local_kthreads;
/*
* Scheduling statistics.
*/
volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
nr_shared_dispatches, nr_migrate_dispatches;
volatile u64 nr_kthread_dispatches, nr_direct_dispatches, nr_shared_dispatches;

/*
* Exit information.
@@ -103,15 +102,21 @@ struct task_ctx {
u64 sum_exec_runtime;

/*
* Voluntary context switches metrics.
* Average time slice used by the task.
*/
u64 nvcsw;
u64 nvcsw_ts;
u64 avg_runtime;

/*
* Task's dynamic priority multiplier.
* Frequency with which a task is blocked (consumer).
*/
u64 lat_weight;
u64 blocked_freq;
u64 last_blocked_at;

/*
* Frequency with which a task wakes other tasks (producer).
*/
u64 waker_freq;
u64 last_woke_at;
};

/* Map that contains task-local storage. */
@@ -181,57 +186,65 @@ static inline bool is_kthread(const struct task_struct *p)
}

/*
* Return the dynamic priority multiplier.
* Return the number of tasks that are waiting to run.
*/
static inline u64 nr_waiting_tasks(void)
{
return scx_bpf_dsq_nr_queued(SHARED_DSQ) + 1;
}

/*
* Return the dynamic priority multiplier (only applied in lowlatency mode).
*
* The multiplier is evaluated as a function of the task's average rate of
* voluntary context switches per second.
* The dynamic priority is evaluated as a function of the task's blocker and
* waker frequencies.
*/
static u64 task_dyn_prio(struct task_struct *p)
static u64 task_latency_weight(const struct task_struct *p,
const struct task_ctx *tctx)
{
struct task_ctx *tctx;
/*
* Prioritize producers (waker_freq) more than consumers
* (blocked_freq).
*/
u64 lat_weight = calc_avg(tctx->waker_freq, tctx->blocked_freq);

tctx = try_lookup_task_ctx(p);
if (!tctx)
return 1;
return MAX(tctx->lat_weight, 1);
return CLAMP(lat_weight, 1, MAX_LATENCY_WEIGHT);
}
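
For reference, a minimal userspace sketch of how the blended weight behaves. calc_avg() and MAX_LATENCY_WEIGHT are defined elsewhere in the scheduler; the stand-ins below (an exponential moving average with 1/4 weight and a cap of 1000) are assumptions, not taken from this commit. Blending with calc_avg(waker_freq, blocked_freq) keeps 3/4 of the waker frequency and mixes in 1/4 of the blocked frequency, which is what gives producers the edge over consumers:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define MAX_LATENCY_WEIGHT	1000	/* assumed cap, defined outside this diff */
#define CLAMP(v, lo, hi)	((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

/* Assumed EWMA helper: keep 3/4 of the old value, blend in 1/4 of the new one. */
static u64 calc_avg(u64 old_val, u64 new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* Same shape as task_latency_weight(): blend producer and consumer rates, then clamp. */
static u64 latency_weight(u64 waker_freq, u64 blocked_freq)
{
	u64 w = calc_avg(waker_freq, blocked_freq);

	return CLAMP(w, 1, MAX_LATENCY_WEIGHT);
}

int main(void)
{
	/* A pipeline stage that wakes peers ~200 times/s and blocks ~50 times/s. */
	printf("latency weight: %llu\n",
	       (unsigned long long)latency_weight(200, 50));
	return 0;
}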

/*
* Return task's dynamic priority.
*/
static u64 task_prio(struct task_struct *p)
static u64 task_prio(const struct task_struct *p, const struct task_ctx *tctx)
{
return p->scx.weight * task_dyn_prio(p);
u64 prio = p->scx.weight * task_latency_weight(p, tctx);

return CLAMP(prio, 1, 10000);
}

/*
* Return the task's allowed lag: used to determine how early its deadline
* can be.
* Return a value inversely proportional to the task's priority.
*/
static u64 task_lag(struct task_struct *p)
static u64 scale_inverse_fair(const struct task_struct *p,
const struct task_ctx *tctx, u64 value)
{
return MIN(slice_lag * task_prio(p) / 100, NSEC_PER_SEC);
return value * 100 / task_prio(p, tctx);
}

/*
* Return a value inversely proportional to the task's weight.
* Evaluate the task's deadline component.
*/
static inline u64 scale_inverse_fair(struct task_struct *p, u64 value)
static u64 task_compute_dl(const struct task_struct *p,
const struct task_ctx *tctx)
{
return value * 100 / task_prio(p, tctx);
return scale_inverse_fair(p, tctx, tctx->avg_runtime);
}
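
To make the deadline component concrete, here is a small sketch with illustrative numbers (none of them come from the commit). Since prio is p->scx.weight multiplied by the latency weight (clamped to [1, 10000]), the offset added on top of the task's vruntime shrinks as the priority grows, so latency-sensitive tasks are picked from the shared DSQ earlier:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* Same arithmetic as scale_inverse_fair() applied to avg_runtime. */
static u64 deadline_component(u64 avg_runtime, u64 prio)
{
	return avg_runtime * 100 / prio;
}

int main(void)
{
	u64 avg_runtime = 2000000;	/* 2 ms of average used time slice */

	/* Default-weight CPU hog vs. a latency-boosted task: */
	printf("prio 100:  +%llu ns\n",
	       (unsigned long long)deadline_component(avg_runtime, 100));
	printf("prio 1000: +%llu ns\n",
	       (unsigned long long)deadline_component(avg_runtime, 1000));
	return 0;
}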

/*
* Return task's evaluated deadline.
*/
static inline u64 task_vtime(struct task_struct *p)
static u64 task_deadline(struct task_struct *p, const struct task_ctx *tctx)
{
u64 min_vruntime = vtime_now - task_lag(p);
struct task_ctx *tctx;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return min_vruntime;
u64 min_vruntime = vtime_now - slice_lag;

/*
* Limit the vruntime to vtime_now minus the maximum task's lag to
@@ -240,24 +253,18 @@ static inline u64 task_vtime(struct task_struct *p)
if (vtime_before(p->scx.dsq_vtime, min_vruntime))
p->scx.dsq_vtime = min_vruntime;

return p->scx.dsq_vtime;
}

static inline u64 nr_waiting_tasks(void)
{
return scx_bpf_dsq_nr_queued(SHARED_DSQ);
return p->scx.dsq_vtime + task_compute_dl(p, tctx);
}

/*
* Evaluate the task's time slice as a function of the total number of tasks
* that are waiting to be dispatched and the task's weight.
*/
static inline void task_refill_slice(struct task_struct *p)
static void task_refill_slice(struct task_struct *p)
{
u64 slice, nr_waiting = nr_waiting_tasks();
u64 nr_waiting = nr_waiting_tasks();

slice = slice_max / (nr_waiting + 1);
p->scx.slice = CLAMP(slice, slice_min, slice_max);
p->scx.slice = CLAMP(slice_max / nr_waiting, slice_min, slice_max);
}
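
As a rough illustration of how the slice shrinks with contention, the sketch below uses assumed defaults for slice_min and slice_max (they are tunables of the scheduler, not values taken from this diff). nr_waiting already includes the +1 added by nr_waiting_tasks(), so a fully idle system gets the full slice_max:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define CLAMP(v, lo, hi)	((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

static const u64 slice_min = 500000ULL;		/* 0.5 ms, assumed default */
static const u64 slice_max = 20000000ULL;	/* 20 ms, assumed default */

/* Same shape as task_refill_slice(). */
static u64 refill_slice(u64 nr_waiting)
{
	return CLAMP(slice_max / nr_waiting, slice_min, slice_max);
}

int main(void)
{
	printf("idle system:      %llu ns\n", (unsigned long long)refill_slice(1));
	printf("4 queued tasks:   %llu ns\n", (unsigned long long)refill_slice(5));
	printf("100 queued tasks: %llu ns\n", (unsigned long long)refill_slice(101));
	return 0;
}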

/*
@@ -470,27 +477,42 @@ s32 BPF_STRUCT_OPS(fair_select_cpu, struct task_struct *p,
return cpu;
}

/*
* If we couldn't find any idle CPU, return any CPU in the task's
* domain.
*
* There is no guarantee that the task will be dispatched on this CPU,
* but it can help to wake up in advance a CPU suitable for the task.
*/
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

return cpu >= 0 ? cpu : prev_cpu;
return prev_cpu;
}

/*
* Wake up an idle CPU for task @p.
*/
static void kick_task_cpu(struct task_struct *p)
{
s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
s32 cpu = scx_bpf_task_cpu(p);

/*
* If the task isn't allowed to use its previously used CPU, try to pick
* any random idle CPU in its new allowed CPU domain.
*/
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0)
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
return;
}

/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is idle.
*/
if (p->nr_cpus_allowed == 1 || p->migration_disabled) {
if (scx_bpf_test_and_clear_cpu_idle(cpu))
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

/*
* Otherwise, use the regular idle CPU selection logic.
*/
cpu = pick_idle_cpu(p, cpu, 0);
if (cpu >= 0)
scx_bpf_kick_cpu(cpu, 0);
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

/*
Expand All @@ -499,7 +521,11 @@ static void kick_task_cpu(struct task_struct *p)
*/
void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags)
{
s32 cpu;
struct task_ctx *tctx;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return;

/*
* Per-CPU kthreads can be critical for system responsiveness, when
@@ -508,7 +534,7 @@ void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags)
*/
if (local_kthreads && is_kthread(p) && p->nr_cpus_allowed == 1) {
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
enq_flags);
enq_flags | SCX_ENQ_PREEMPT);
__sync_fetch_and_add(&nr_kthread_dispatches, 1);
return;
}
@@ -518,7 +544,7 @@ void BPF_STRUCT_OPS(fair_enqueue, struct task_struct *p, u64 enq_flags)
* the first CPU that becomes available.
*/
scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL,
task_vtime(p), enq_flags);
task_deadline(p, tctx), enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);

/*
Expand Down Expand Up @@ -590,6 +616,12 @@ void BPF_STRUCT_OPS(fair_running, struct task_struct *p)
* Adjust target CPU frequency before the task starts to run.
*/
update_cpuperf_target(p);

/*
* Update global vruntime.
*/
if (vtime_before(vtime_now, p->scx.dsq_vtime))
vtime_now = p->scx.dsq_vtime;
}

/*
@@ -598,78 +630,76 @@ void BPF_STRUCT_OPS(fair_running, struct task_struct *p)
*/
void BPF_STRUCT_OPS(fair_stopping, struct task_struct *p, bool runnable)
{
u64 now = bpf_ktime_get_ns(), slice, delta_t;
u64 now = bpf_ktime_get_ns(), slice;
s32 cpu = scx_bpf_task_cpu(p);
struct cpu_ctx *cctx;
struct task_ctx *tctx;

cctx = try_lookup_cpu_ctx(cpu);
if (cctx)
cctx->tot_runtime += now - cctx->last_running;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return;

/*
* If the time slice is not fully depleted, it means that the task
* voluntarily released the CPU, therefore update the voluntary context
* switch counter.
*
* NOTE: the sched_ext core implements sched_yield() by setting the
* time slice to 0, so we won't boost the priority of tasks that are
* explicitly calling sched_yield().
*
* This is actually a good thing, because we want to prioritize tasks
* that are releasing the CPU, because they're doing I/O, waiting for
* input or sending output to other tasks.
*
* Tasks that are using sched_yield() don't really need the priority
* boost and when they get the chance to run again they will be
* naturally prioritized by the vruntime-based scheduling policy.
*/
if (p->scx.slice > 0)
tctx->nvcsw++;
cctx = try_lookup_cpu_ctx(cpu);
if (cctx)
cctx->tot_runtime += now - cctx->last_running;

/*
* Evaluate task's used time slice.
*/
slice = CLAMP(p->se.sum_exec_runtime - tctx->sum_exec_runtime,
slice_min, slice_max);
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
slice = scale_inverse_fair(p, slice);
slice = scale_inverse_fair(p, tctx, slice);
tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice);

/*
* Update task's vruntime by adding the used time slice, scaled by its
* priority.
*/
p->scx.dsq_vtime += slice;
}

/*
* Update global system vruntime.
*/
vtime_now += slice;
static u64 update_freq(u64 freq, u64 delta)
{
u64 new_freq;

/*
* Update task's average rate of voluntary context switches per second.
*/
delta_t = (s64)(now - tctx->nvcsw_ts);
if (delta_t > NSEC_PER_SEC) {
/*
* Evaluate the task's latency weight as the task's average
* rate of voluntary context switches per second.
*/
u64 avg_nvcsw = tctx->nvcsw * NSEC_PER_SEC / delta_t;
u64 lat_weight = MIN(avg_nvcsw, MAX_LATENCY_WEIGHT);
new_freq = NSEC_PER_SEC / delta;
return calc_avg(freq, new_freq);
}

tctx->nvcsw = 0;
tctx->nvcsw_ts = now;
tctx->lat_weight = calc_avg(tctx->lat_weight, lat_weight);
}
void BPF_STRUCT_OPS(fair_runnable, struct task_struct *p, u64 enq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_struct *waker;
struct task_ctx *tctx;

waker = bpf_get_current_task_btf();
tctx = try_lookup_task_ctx(waker);
if (!tctx)
return;

delta = MAX(now - tctx->last_woke_at, 1);
tctx->waker_freq = update_freq(tctx->waker_freq, delta);
tctx->last_woke_at = now;
}

void BPF_STRUCT_OPS(fair_quiescent, struct task_struct *p, u64 deq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_ctx *tctx;

tctx = try_lookup_task_ctx(p);
if (!tctx)
return;

delta = MAX(now - tctx->last_blocked_at, 1);
tctx->blocked_freq = update_freq(tctx->blocked_freq, delta);
tctx->last_blocked_at = now;
}
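
The runnable/quiescent pair above is what feeds the waker and blocker frequencies: on every event, the elapsed time since the previous one is turned into an events-per-second rate and smoothed. Below is a minimal userspace sketch of that pattern; calc_avg() is assumed to be the scheduler's exponential moving average helper (defined outside this diff):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define NSEC_PER_SEC	1000000000ULL
#define MAX(a, b)	((a) > (b) ? (a) : (b))

/* Assumed EWMA helper, defined elsewhere in the scheduler. */
static u64 calc_avg(u64 old_val, u64 new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* Same shape as update_freq(): time since the last event -> events/s, smoothed. */
static u64 update_freq(u64 freq, u64 delta)
{
	return calc_avg(freq, NSEC_PER_SEC / delta);
}

int main(void)
{
	u64 blocked_freq = 0, last_blocked_at = 0, now;

	/* Simulate a task that blocks every 10 ms: blocked_freq converges
	 * towards ~100 events/s. */
	for (now = 10000000ULL; now <= 100000000ULL; now += 10000000ULL) {
		u64 delta = MAX(now - last_blocked_at, 1);

		blocked_freq = update_freq(blocked_freq, delta);
		last_blocked_at = now;
	}
	printf("blocked_freq after 10 events: %llu\n",
	       (unsigned long long)blocked_freq);
	return 0;
}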

void BPF_STRUCT_OPS(fair_enable, struct task_struct *p)
{
u64 now = bpf_ktime_get_ns();
struct task_ctx *tctx;

p->scx.dsq_vtime = vtime_now;
@@ -681,7 +711,8 @@ void BPF_STRUCT_OPS(fair_enable, struct task_struct *p)
return;
}
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->nvcsw_ts = bpf_ktime_get_ns();
tctx->last_woke_at = now;
tctx->last_blocked_at = now;
}

s32 BPF_STRUCT_OPS(fair_init_task, struct task_struct *p,
@@ -799,9 +830,12 @@ SCX_OPS_DEFINE(fair_ops,
.dispatch = (void *)fair_dispatch,
.running = (void *)fair_running,
.stopping = (void *)fair_stopping,
.runnable = (void *)fair_runnable,
.quiescent = (void *)fair_quiescent,
.enable = (void *)fair_enable,
.init_task = (void *)fair_init_task,
.init = (void *)fair_init,
.exit = (void *)fair_exit,
.flags = SCX_OPS_ENQ_EXITING,
.timeout_ms = 5000,
.name = "fair");