From 76f78ff6e06b6b24e2d05724fcfa730f2574d38e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:04 -1000 Subject: [PATCH 001/304] cgroup: Implement cgroup_show_cftypes() Implement cgroup_show_cftypes() which shows and hides all cgroup files associated with the specified set of cgroup file types. CFTYPE_HIDDEN flag is added so that files can be created hidden from the get-go. cgroup_show_cftypes() can be used whether the cftypes are added or not. It also combines with cgroup_show_file() so that a given file is visible iff both its cftype and cfile are visible. This will be used by a new sched_class to selectively show and hide CPU controller interface files depending on whether they're supported. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/cgroup-defs.h | 8 +++ include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 97 ++++++++++++++++++++++++++++++++++--- 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a0d5466c7be1..8af1e7d487cbf 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -127,12 +127,18 @@ enum { CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; +enum cfile_flags { + CFILE_HIDDEN = (1 << 0), /* file instance hidden */ +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. 
This can @@ -140,7 +146,9 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ + struct cftype *cft; struct kernfs_node *kn; + unsigned int flags; unsigned long notified_at; struct timer_list notify_timer; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3410aecffdb47..a8c6982c2c24d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -115,6 +115,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_show_cftype(struct cftype *cft, bool show); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8a5294f4ce720..f762bfb78f4a4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4206,10 +4206,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, return ret; } + kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); + if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + cfile->cft = cft; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; @@ -4488,6 +4491,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + return kn; +} + +static bool cfile_visible(struct cgroup_file *cfile) +{ + return !(cfile->cft->flags & CFTYPE_HIDDEN) && + !(cfile->flags & CFILE_HIDDEN); +} + /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset @@ -4497,15 +4518,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); - kn = cfile->kn; - kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + mutex_lock(&cgroup_mutex); - if (kn) - kernfs_show(kn, show); + if (show) + cfile->flags &= ~CFILE_HIDDEN; + else + cfile->flags |= CFILE_HIDDEN; - kernfs_put(kn); + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + + mutex_unlock(&cgroup_mutex); } /** @@ -5519,6 +5545,63 @@ static void offline_css(struct cgroup_subsys_state *css) wake_up_all(&css->cgroup->offline_waitq); } +/** + * cgroup_show_cftype - show or hide a cgroup file type + * @cft: cftype to show or hide + * @show: whether to show or hide + * + * Sets %CFTYPE_HIDDEN and shows/hides the matching files according to @show. + * @cft may or may not be added at the time of this call. After hiding, it's + * guaranteed that there are no in-flight operations on the hidden files. + */ +void cgroup_show_cftype(struct cftype *cft, bool show) +{ + struct cgroup_subsys *ss = cft->ss; + struct cgroup *root = ss ? 
&ss->root->cgrp : &cgrp_dfl_root.cgrp; + struct cgroup_subsys_state *css; + + mutex_lock(&cgroup_mutex); + + if (show) + cft->flags &= ~CFTYPE_HIDDEN; + else + cft->flags |= CFTYPE_HIDDEN; + + if (!(cft->flags & __CFTYPE_ADDED)) + goto out_unlock; + + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + struct kernfs_node *kn; + + if (!(css->flags & CSS_VISIBLE)) + continue; + + if (cft->file_offset) { + struct cgroup_file *cfile = + (void *)css + cft->file_offset; + + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + } else { + char buf[CGROUP_FILE_NAME_MAX]; + + kn = kernfs_find_and_get(cgrp->kn, + cgroup_file_name(cgrp, cft, buf)); + if (kn) { + kernfs_show(kn, show); + kernfs_put(kn); + } + } + } + +out_unlock: + mutex_unlock(&cgroup_mutex); +} + /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with From 442936262688f0ead10291cd29bed3ef436b1bef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:04 -1000 Subject: [PATCH 002/304] sched: Encapsulate task attribute change sequence into a helper macro A task needs to be dequeued and put before an attribute change and then restored afterwards. This is currently open-coded in multiple places. This patch encapsulates the preparation and restoration sequences into SCHED_CHANGE_BLOCK which allows the actual attribute changes to be put inside its nested block. While the conversions are generally straightforward, there are some subtleties: * If a variable is specified for the flags argument, it can be modified from inside the block body to allow using a different flags value for re-enqueueing. This is used by rt_mutex_setprio() and __sched_setscheduler(). * __sched_setscheduler() used to only set ENQUEUE_HEAD if the task is queued. After the conversion, it sets the flag whether the task is queued or not. This doesn't cause any behavioral differences and is simpler than accessing the internal state of the helper. * In a similar vein, sched_move_task() tests task_current() again after the change block instead of carrying over the test result from inside the change block. This patch is adopted from Peter Zijlstra's draft patch linked below. The changes are: * Call fini explicitly from for() instead of using the __cleanup__ attribute. * Allow the queue flag variable to be modified directly so that the user doesn't have to poke into sched_change_guard struct. Also, in the original patch, rt_mutex_setprio() was incorrectly updating its queue_flag instead of cg.flags. * Some cosmetic changes. 
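As a rough before/after illustration of the conversion pattern (a sketch only, distilled from the hunks below; update_some_attribute() is a placeholder, not a real function), an open-coded sequence such as

	/* before: open-coded save/restore around the attribute update */
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	if (running)
		put_prev_task(rq, p);

	update_some_attribute(p);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);

collapses into

	/* after: the helper handles preparation and restoration */
	SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
		update_some_attribute(p);
	}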
Signed-off-by: Tejun Heo Original-patch-by: Peter Zijlstra Link: https://lore.kernel.org/all/20220330162228.GH14330@worktop.programming.kicks-ass.net/T/#u Reviewed-by: David Vernet --- kernel/sched/core.c | 260 ++++++++++++++++++++++---------------------- 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index af017e038b482..fb080ca54d808 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2096,6 +2096,76 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } +struct sched_change_guard { + struct task_struct *p; + struct rq *rq; + bool queued; + bool running; + bool done; +}; + +static struct sched_change_guard +sched_change_guard_init(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_change_guard cg = { + .rq = rq, + .p = p, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + if (cg.queued) { + /* + * __kthread_bind() may call this on blocked tasks without + * holding rq->lock through __do_set_cpus_allowed(). Assert @rq + * locked iff @p is queued. + */ + lockdep_assert_rq_held(rq); + dequeue_task(rq, p, flags); + } + if (cg.running) + put_prev_task(rq, p); + + return cg; +} + +static void sched_change_guard_fini(struct sched_change_guard *cg, int flags) +{ + if (cg->queued) + enqueue_task(cg->rq, cg->p, flags | ENQUEUE_NOCLOCK); + if (cg->running) + set_next_task(cg->rq, cg->p); + cg->done = true; +} + +/** + * SCHED_CHANGE_BLOCK - Nested block for task attribute updates + * @__rq: Runqueue the target task belongs to + * @__p: Target task + * @__flags: DEQUEUE/ENQUEUE_* flags + * + * A task may need to be dequeued and put_prev_task'd for attribute updates and + * set_next_task'd and re-enqueued afterwards. This helper defines a nested + * block which automatically handles these preparation and cleanup operations. + * + * SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + * update_attribute(p); + * ... + * } + * + * If @__flags is a variable, the variable may be updated in the block body and + * the updated value will be used when re-enqueueing @p. + * + * If %DEQUEUE_NOCLOCK is specified, the caller is responsible for calling + * update_rq_clock() beforehand. Otherwise, the rq clock is automatically + * updated iff the task needs to be dequeued and re-enqueued. Only the former + * case guarantees that the rq clock is up-to-date inside and after the block. + */ +#define SCHED_CHANGE_BLOCK(__rq, __p, __flags) \ + for (struct sched_change_guard __cg = \ + sched_change_guard_init(__rq, __p, __flags); \ + !__cg.done; sched_change_guard_fini(&__cg, __flags)) + static inline int __normal_prio(int policy, int rt_prio, int nice) { int prio; @@ -2554,7 +2624,6 @@ static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { struct rq *rq = task_rq(p); - bool queued, running; /* * This here violates the locking rules for affinity, since we're only @@ -2573,26 +2642,9 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) else lockdep_assert_held(&p->pi_lock); - queued = task_on_rq_queued(p); - running = task_current(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. 
- */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + p->sched_class->set_cpus_allowed(p, ctx); } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); } /* @@ -6989,7 +7041,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; @@ -7049,49 +7101,39 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); - - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + SCHED_CHANGE_BLOCK(rq, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + queue_flag |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + queue_flag |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - - __setscheduler_prio(p, prio); - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); + __setscheduler_prio(p, prio); + } check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -7113,7 +7155,6 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; int old_prio; struct rq_flags rf; struct rq *rq; @@ -7137,22 +7178,13 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); + 
SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } /* * If the task increased its priority or is running and @@ -7536,7 +7568,7 @@ static int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class; struct balance_callback *head; struct rq_flags rf; @@ -7701,33 +7733,22 @@ static int __sched_setscheduler(struct task_struct *p, queue_flags &= ~DEQUEUE_MOVE; } - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); - - prev_class = p->sched_class; + SCHED_CHANGE_BLOCK(rq, p, queue_flags) { + prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } + __setscheduler_uclamp(p, attr); - if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). */ if (oldprio < p->prio) queue_flags |= ENQUEUE_HEAD; - - enqueue_task(rq, p, queue_flags); } - if (running) - set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -9250,25 +9271,15 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; + SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE) { + p->numa_preferred_nid = nid; + } - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -10360,35 +10371,24 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(tsk, &rf); update_rq_clock(rq); - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); + SCHED_CHANGE_BLOCK(rq, tsk, + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) { + sched_change_group(tsk); + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + /* + * After changing group, the running task may have joined a throttled + * one but it's still the running task. Trigger a resched to make sure + * that task can still run. 
+ */ + if (task_current(rq, tsk)) resched_curr(rq); - } task_rq_unlock(rq, tsk, &rf); } From 7991cab6aa3190f265387c044a7c27ae6ba61e1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 003/304] sched: Restructure sched_class order sanity checks in sched_init() Currently, sched_init() checks that the sched_class'es are in the expected order by testing each adjacency which is a bit brittle and makes it cumbersome to add optional sched_class'es. Instead, let's verify whether they're in the expected order using sched_class_above() which is what matters. Signed-off-by: Tejun Heo Suggested-by: Peter Zijlstra Reviewed-by: David Vernet --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb080ca54d808..efac96fd6cfd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9794,12 +9794,12 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class != &fair_sched_class + 1 || - &fair_sched_class != &rt_sched_class + 1 || - &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class != &stop_sched_class + 1); + BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); #endif + BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); + BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); + BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); wait_bit_init(); From e1460a8adaa038856cf32025a78dc577a0b58d3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 004/304] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() A new BPF extensible sched_class will need more control over the forking process. It wants to be able to fail from sched_cgroup_fork() after the new task's sched_task_group is initialized so that the loaded BPF program can prepare the task with its cgroup association is established and reject fork if e.g. allocation fails. Allow sched_cgroup_fork() to fail by making it return int instead of void and adding sched_cancel_fork() to undo sched_fork() in the error path. sched_cgroup_fork() doesn't fail yet and this patch shouldn't cause any behavior changes. v2: Patch description updated to detail the expected use. 
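For orientation, the resulting error-unwinding order in copy_process() is, condensed from the hunks below (intermediate labels elided):

	retval = sched_cgroup_fork(p, args);
	if (retval)
		goto bad_fork_cancel_cgroup;
	...
bad_fork_cancel_cgroup:
	cgroup_cancel_fork(p, args);
	...
bad_fork_sched_cancel_fork:
	sched_cancel_fork(p);
bad_fork_cleanup_policy:
	...

so a failure from sched_cgroup_fork() or any later step eventually falls through to sched_cancel_fork(), which undoes sched_fork().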
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/task.h | 3 ++- kernel/fork.c | 15 ++++++++++----- kernel/sched/core.c | 8 +++++++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 357e0068497c1..dcff721170c30 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -58,7 +58,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index f68954d05e89d..0d166537a1a37 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2242,7 +2242,7 @@ static __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; @@ -2383,7 +2383,9 @@ static __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ - sched_cgroup_fork(p, args); + retval = sched_cgroup_fork(p, args); + if (retval) + goto bad_fork_cancel_cgroup; /* * From this point on we must avoid any synchronous user-space @@ -2429,13 +2431,13 @@ static __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* No more failure paths after this point. 
*/ @@ -2510,10 +2512,11 @@ static __latent_entropy struct task_struct *copy_process( return p; -bad_fork_cancel_cgroup: +bad_fork_core_free: sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); +bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { @@ -2552,6 +2555,8 @@ static __latent_entropy struct task_struct *copy_process( audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_sched_cancel_fork: + sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index efac96fd6cfd6..fdf4dba12a7ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4768,7 +4768,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; @@ -4795,6 +4795,12 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return 0; +} + +void sched_cancel_fork(struct task_struct *p) +{ } void sched_post_fork(struct task_struct *p) From 52428c35b073b80f7a012f07e0d40e9e8f7ae228 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 005/304] sched: Add sched_class->reweight_task() Currently, during a task weight change, sched core directly calls reweight_task() defined in fair.c if @p is on CFS. Let's make it a proper sched_class operation instead. CFS's reweight_task() is renamed to reweight_task_fair() and now called through sched_class. While it turns a direct call into an indirect one, set_load_weight() isn't called from a hot path and this change shouldn't cause any noticeable difference. This will be used to implement reweight_task for a new BPF extensible sched_class so that it can keep its cached task weight up-to-date. This will be used by a new sched_class to track weight changes. 
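Condensed, the new dispatch in set_load_weight() amounts to the following (sketch of the hunk that follows):

	if (update_load && p->sched_class->reweight_task)
		p->sched_class->reweight_task(task_rq(p), p, prio);
	else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
	}

Any class providing ->reweight_task() (currently only fair, via reweight_task_fair()) is notified of the weight change, while classes without the callback keep the plain table lookup.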
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 3 ++- kernel/sched/sched.h | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fdf4dba12a7ef..5a6f4884384e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1277,8 +1277,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); + if (update_load && p->sched_class->reweight_task) { + p->sched_class->reweight_task(task_rq(p), p, prio); } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a1b1f855b963..681ab0dd0bc17 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3342,7 +3342,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -12547,6 +12547,7 @@ DEFINE_SCHED_CLASS(fair) = { .task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3e8df6d31c1e3..7934b597053df 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2213,6 +2213,8 @@ struct sched_class { */ void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + int newprio); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2365,8 +2367,6 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); - extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); From 8197857b3b88ed63eb17dbe7c83693973846e508 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 006/304] sched: Add sched_class->switching_to() and expose check_class_changing/changed() When a task switches to a new sched_class, the prev and new classes are notified through ->switched_from() and ->switched_to(), respectively, after the switching is done. A new BPF extensible sched_class will have callbacks that allow the BPF scheduler to keep track of relevant task states (like priority and cpumask). Those callbacks aren't called while a task is on a different sched_class. When a task comes back, we wanna tell the BPF progs the up-to-date state before the task gets enqueued, so we need a hook which is called before the switching is committed. This patch adds ->switching_to() which is called during sched_class switch through check_class_changing() before the task is restored. Also, this patch exposes check_class_changing/changed() in kernel/sched/sched.h. They will be used by the new BPF extensible sched_class to implement implicit sched_class switching which is used e.g. 
when falling back to CFS when the BPF scheduler fails or unloads. This is a prep patch and doesn't cause any behavior changes. The new operation and exposed functions aren't used yet. v2: Improve patch description w/ details on planned use. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 20 +++++++++++++++++--- kernel/sched/sched.h | 7 +++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5a6f4884384e3..a378e8e090610 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2223,6 +2223,17 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. + */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + /* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. @@ -2230,9 +2241,9 @@ inline int task_curr(const struct task_struct *p) * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -7139,6 +7150,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); + check_class_changing(rq, p, prev_class); } check_class_changed(rq, p, prev_class, oldprio); @@ -7748,6 +7760,8 @@ static int __sched_setscheduler(struct task_struct *p, } __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); + /* * We enqueue to tail when the priority of a task is * increased (user space view). diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7934b597053df..1545779c5db87 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2211,6 +2211,7 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. 
*/ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, @@ -2447,6 +2448,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT From 842682961cb9f42f649093dece484dcfbfeb09a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 007/304] sched: Factor out cgroup weight conversion functions Factor out sched_weight_from/to_cgroup() which convert between scheduler shares and cgroup weight. No functional change. The factored out functions will be used by a new BPF extensible sched_class so that the weights can be exposed to the BPF programs in a way which is consistent cgroup weights and easier to interpret. The weight conversions will be used regardless of cgroup usage. It's just borrowing the cgroup weight range as it's more intuitive. CGROUP_WEIGHT_MIN/DFL/MAX constants are moved outside CONFIG_CGROUPS so that the conversion helpers can always be defined. v2: The helpers are now defined regardless of COFNIG_CGROUPS. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/cgroup.h | 4 ++-- kernel/sched/core.c | 28 +++++++++++++--------------- kernel/sched/sched.h | 18 ++++++++++++++++++ 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a8c6982c2c24d..5080dfc8ee48e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,8 +29,6 @@ struct kernel_clone_args; -#ifdef CONFIG_CGROUPS - /* * All weight knobs on the default hierarchy should use the following min, * default and max values. 
The default value is the logarithmic center of @@ -40,6 +38,8 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +#ifdef CONFIG_CGROUPS + /* walk only threadgroup leaders */ #define CSS_TASK_ITER_PROCS (1U << 0) /* walk all threaded css_sets in the domain */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a378e8e090610..fc70080952491 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11145,29 +11145,27 @@ static int cpu_extra_stat_show(struct seq_file *sf, } #ifdef CONFIG_FAIR_GROUP_SCHED + +static unsigned long tg_weight(struct task_group *tg) +{ + return scale_load_down(tg->shares); +} + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. - */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); return sched_group_set_shares(css_tg(css), scale_load(weight)); } @@ -11175,7 +11173,7 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1545779c5db87..9a6cba6f9299b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -232,6 +232,24 @@ static inline void update_avg(u64 *avg, u64 sample) #define shr_bound(val, shift) \ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) +/* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + /* * !! For sched_setattr_nocheck() (kernel) only !! * From 82e6cbdbf58af30a590de1ad041f0dd90795679f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 008/304] sched: Expose css_tg(), __setscheduler_prio() and SCHED_CHANGE_BLOCK() These will be used by a new BPF extensible sched_class. 
css_tg() will be used in the init and exit paths to visit all task_groups by walking cgroups. __setscheduler_prio() is used to pick the sched_class matching the current prio of the task. For the new BPF extensible sched_class, the mapping from the task configuration to sched_class isn't static and depends on a few factors - e.g. whether the BPF progs implementing the scheduler are loaded and in a serviceable state. That mapping logic will be added to __setscheduler_prio(). When the BPF scheduler progs get loaded and unloaded, the mapping changes and the new sched_class will walk the tasks applying the new mapping using SCHED_CHANGE_BLOCK() and __setscheduler_prio(). v2: Expose SCHED_CHANGE_BLOCK() too and update the description. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Reported-by: kernel test robot --- kernel/sched/core.c | 47 +++---------------------------------------- kernel/sched/sched.h | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 44 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc70080952491..2a602f93f5f8b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2096,15 +2096,7 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -struct sched_change_guard { - struct task_struct *p; - struct rq *rq; - bool queued; - bool running; - bool done; -}; - -static struct sched_change_guard +struct sched_change_guard sched_change_guard_init(struct rq *rq, struct task_struct *p, int flags) { struct sched_change_guard cg = { @@ -2129,7 +2121,7 @@ sched_change_guard_init(struct rq *rq, struct task_struct *p, int flags) return cg; } -static void sched_change_guard_fini(struct sched_change_guard *cg, int flags) +void sched_change_guard_fini(struct sched_change_guard *cg, int flags) { if (cg->queued) enqueue_task(cg->rq, cg->p, flags | ENQUEUE_NOCLOCK); @@ -2138,34 +2130,6 @@ static void sched_change_guard_fini(struct sched_change_guard *cg, int flags) cg->done = true; } -/** - * SCHED_CHANGE_BLOCK - Nested block for task attribute updates - * @__rq: Runqueue the target task belongs to - * @__p: Target task - * @__flags: DEQUEUE/ENQUEUE_* flags - * - * A task may need to be dequeued and put_prev_task'd for attribute updates and - * set_next_task'd and re-enqueued afterwards. This helper defines a nested - * block which automatically handles these preparation and cleanup operations. - * - * SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { - * update_attribute(p); - * ... - * } - * - * If @__flags is a variable, the variable may be updated in the block body and - * the updated value will be used when re-enqueueing @p. - * - * If %DEQUEUE_NOCLOCK is specified, the caller is responsible for calling - * update_rq_clock() beforehand. Otherwise, the rq clock is automatically - * updated iff the task needs to be dequeued and re-enqueued. Only the former - * case guarantees that the rq clock is up-to-date inside and after the block. 
- */ -#define SCHED_CHANGE_BLOCK(__rq, __p, __flags) \ - for (struct sched_change_guard __cg = \ - sched_change_guard_init(__rq, __p, __flags); \ - !__cg.done; sched_change_guard_fini(&__cg, __flags)) - static inline int __normal_prio(int policy, int rt_prio, int nice) { int prio; @@ -7016,7 +6980,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -static void __setscheduler_prio(struct task_struct *p, int prio) +void __setscheduler_prio(struct task_struct *p, int prio) { if (dl_prio(prio)) p->sched_class = &dl_sched_class; @@ -10413,11 +10377,6 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, tsk, &rf); } -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct task_group, css) : NULL; -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a6cba6f9299b..866ce69a445e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -469,6 +469,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); @@ -2386,6 +2391,8 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void __setscheduler_prio(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -2466,6 +2473,47 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +struct sched_change_guard { + struct task_struct *p; + struct rq *rq; + bool queued; + bool running; + bool done; +}; + +extern struct sched_change_guard +sched_change_guard_init(struct rq *rq, struct task_struct *p, int flags); + +extern void sched_change_guard_fini(struct sched_change_guard *cg, int flags); + +/** + * SCHED_CHANGE_BLOCK - Nested block for task attribute updates + * @__rq: Runqueue the target task belongs to + * @__p: Target task + * @__flags: DEQUEUE/ENQUEUE_* flags + * + * A task may need to be dequeued and put_prev_task'd for attribute updates and + * set_next_task'd and re-enqueued afterwards. This helper defines a nested + * block which automatically handles these preparation and cleanup operations. + * + * SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + * update_attribute(p); + * ... + * } + * + * If @__flags is a variable, the variable may be updated in the block body and + * the updated value will be used when re-enqueueing @p. + * + * If %DEQUEUE_NOCLOCK is specified, the caller is responsible for calling + * update_rq_clock() beforehand. Otherwise, the rq clock is automatically + * updated iff the task needs to be dequeued and re-enqueued. Only the former + * case guarantees that the rq clock is up-to-date inside and after the block. 
+ */ +#define SCHED_CHANGE_BLOCK(__rq, __p, __flags) \ + for (struct sched_change_guard __cg = \ + sched_change_guard_init(__rq, __p, __flags); \ + !__cg.done; sched_change_guard_fini(&__cg, __flags)) + extern void check_class_changing(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class); extern void check_class_changed(struct rq *rq, struct task_struct *p, From b3f0cb90a847c3146b5f329a0a4d8acc7a6b0394 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 009/304] sched: Enumerate CPU cgroup file types Rename cpu[_legacy]_files to cpu[_legacy]_cftypes for clarity and add cpu_cftype_id which enumerates every cgroup2 interface file type. This doesn't make any functional difference now. The enums will be used to access specific cftypes by a new BPF extensible sched_class to selectively show and hide CPU controller interface files depending on the capability of the currently loaded BPF scheduler progs. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 22 +++++++++++----------- kernel/sched/sched.h | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a602f93f5f8b..59136fafa94c8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11014,7 +11014,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif -static struct cftype cpu_legacy_files[] = { +static struct cftype cpu_legacy_cftypes[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -11221,21 +11221,21 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, } #endif -static struct cftype cpu_files[] = { +struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { #ifdef CONFIG_FAIR_GROUP_SCHED - { + [CPU_CFTYPE_WEIGHT] = { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_weight_read_u64, .write_u64 = cpu_weight_write_u64, }, - { + [CPU_CFTYPE_WEIGHT_NICE] = { .name = "weight.nice", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, - { + [CPU_CFTYPE_IDLE] = { .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_idle_read_s64, @@ -11243,13 +11243,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_CFS_BANDWIDTH - { + [CPU_CFTYPE_MAX] = { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_max_show, .write = cpu_max_write, }, - { + [CPU_CFTYPE_MAX_BURST] = { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_cfs_burst_read_u64, @@ -11257,13 +11257,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP - { + [CPU_CFTYPE_UCLAMP_MIN] = { .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, }, - { + [CPU_CFTYPE_UCLAMP_MAX] = { .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, @@ -11283,8 +11283,8 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_legacy_files, - .dfl_cftypes = cpu_files, + .legacy_cftypes = cpu_legacy_cftypes, + .dfl_cftypes = cpu_cftypes, .early_init = true, .threaded = true, }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 866ce69a445e8..67f7f11496305 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3379,4 +3379,25 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n 
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } #endif +#ifdef CONFIG_CGROUP_SCHED +enum cpu_cftype_id { +#ifdef CONFIG_FAIR_GROUP_SCHED + CPU_CFTYPE_WEIGHT, + CPU_CFTYPE_WEIGHT_NICE, + CPU_CFTYPE_IDLE, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + CPU_CFTYPE_MAX, + CPU_CFTYPE_MAX_BURST, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + CPU_CFTYPE_UCLAMP_MIN, + CPU_CFTYPE_UCLAMP_MAX, +#endif + CPU_CFTYPE_CNT, +}; + +extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; +#endif /* CONFIG_CGROUP_SCHED */ + #endif /* _KERNEL_SCHED_SCHED_H */ From 28dea3413815e632015101116484d74394687f8c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 010/304] sched: Add @reason to sched_class->rq_{on|off}line() ->rq_{on|off}line are called either during CPU hotplug or cpuset partition updates. A planned BPF extensible sched_class wants to tell the BPF scheduler progs about CPU hotplug events in a way that's synchronized with rq state changes. As the BPF scheduler progs aren't necessarily affected by cpuset partition updates, we need a way to distinguish the two types of events. Let's add an argument to tell them apart. v2: Patch description updated to detail the expected use. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 12 ++++++------ kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 13 +++++++++---- kernel/sched/topology.c | 4 ++-- 6 files changed, 23 insertions(+), 18 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 59136fafa94c8..aa63371aa84c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9428,7 +9428,7 @@ static inline void balance_hotplug_wait(void) #endif /* CONFIG_HOTPLUG_CPU */ -void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq, enum rq_onoff_reason reason) { if (!rq->online) { const struct sched_class *class; @@ -9438,19 +9438,19 @@ void set_rq_online(struct rq *rq) for_each_class(class) { if (class->rq_online) - class->rq_online(rq); + class->rq_online(rq, reason); } } } -void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason) { if (rq->online) { const struct sched_class *class; for_each_class(class) { if (class->rq_offline) - class->rq_offline(rq); + class->rq_offline(rq, reason); } cpumask_clear_cpu(rq->cpu, rq->rd->online); @@ -9546,7 +9546,7 @@ int sched_cpu_activate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9591,7 +9591,7 @@ int sched_cpu_deactivate(unsigned int cpu) if (rq->rd) { update_rq_clock(rq); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 71b24371a6f77..a7bb573c4c82f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2518,7 +2518,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, } /* Assumes rq->lock is held */ -static void rq_online_dl(struct rq *rq) +static void rq_online_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_set_overload(rq); @@ -2529,7 +2529,7 @@ static void rq_online_dl(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_dl(struct rq *rq) +static void 
rq_offline_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_clear_overload(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 681ab0dd0bc17..28204472a3f1b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11805,14 +11805,14 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } -static void rq_online_fair(struct rq *rq) +static void rq_online_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); update_runtime_enabled(rq); } -static void rq_offline_fair(struct rq *rq) +static void rq_offline_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0a11f44adee57..2b4c769438a1b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2473,7 +2473,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) } /* Assumes rq->lock is held */ -static void rq_online_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -2484,7 +2484,7 @@ static void rq_online_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_clear_overload(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 67f7f11496305..958613dd82903 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2183,6 +2183,11 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) +enum rq_onoff_reason { + RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ + RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ +}; + struct affinity_context { const struct cpumask *new_mask; struct cpumask *user_mask; @@ -2219,8 +2224,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); - void (*rq_online)(struct rq *rq); - void (*rq_offline)(struct rq *rq); + void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); + void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif @@ -2787,8 +2792,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) raw_spin_rq_unlock(rq1); } -extern void set_rq_online (struct rq *rq); -extern void set_rq_offline(struct rq *rq); +extern void set_rq_online (struct rq *rq, enum rq_onoff_reason reason); +extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); extern bool sched_smp_initialized; #else /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 051aaf65c7496..155c4e7e0f085 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -495,7 +495,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_TOPOLOGY); cpumask_clear_cpu(rq->cpu, old_rd->span); @@ -513,7 +513,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_set_cpu(rq->cpu, rd->span); if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_TOPOLOGY); raw_spin_rq_unlock_irqrestore(rq, flags); From a756aec6ee48bd49a35e25bd6c9df60d879821c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:05 -1000 Subject: [PATCH 011/304] sched: Add normal_policy() A new BPF extensible sched_class will need to dynamically change how a task picks its 
sched_class. For example, if the loaded BPF scheduler progs fail, the tasks will be forced back on CFS even if the task's policy is set to the new sched_class. To support such mapping, add normal_policy() which wraps testing for %SCHED_NORMAL. This doesn't cause any behavior changes. v2: Update the description with more details on the expected use. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28204472a3f1b..ea3788ef9686a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7806,7 +7806,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 958613dd82903..6397843b4482d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -182,9 +182,15 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline int rt_policy(int policy) From 23054b8f6802136a5456895ead0fa31ea88ebbd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:06 -1000 Subject: [PATCH 012/304] sched_ext: Add boilerplate for extensible scheduler class This adds dummy implementations of sched_ext interfaces which interact with the scheduler core and hook them in the correct places. As they're all dummies, this doesn't cause any behavior changes. This is split out to help reviewing. v2: balance_scx_on_up() dropped. This will be handled in sched_ext proper. 
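As context for why this patch is a no-op, the !CONFIG_SCHED_CLASS_EXT stubs added below reduce every hook site to its current behavior, roughly:

	#define scx_enabled()			false
	#define for_each_active_class		for_each_class
	#define for_balance_class_range		for_class_range

	static inline void scx_pre_fork(struct task_struct *p) {}
	static inline int scx_fork(struct task_struct *p) { return 0; }
	static inline void scx_post_fork(struct task_struct *p) {}
	static inline void scx_cancel_fork(struct task_struct *p) {}
	static inline void scx_update_idle(struct rq *rq, bool idle) {}

So, for example, the scx_enabled() check in __pick_next_task() compiles out and the class iteration macros fall back to their existing counterparts until the real implementation lands in a later patch.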
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++++++ kernel/fork.c | 2 ++ kernel/sched/core.c | 32 ++++++++++++++++++++++++-------- kernel/sched/ext.h | 24 ++++++++++++++++++++++++ kernel/sched/idle.c | 2 ++ kernel/sched/sched.h | 2 ++ 6 files changed, 66 insertions(+), 8 deletions(-) create mode 100644 include/linux/sched/ext.h create mode 100644 kernel/sched/ext.h diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 0000000000000..a05dfcf533b0c --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT +#error "NOT IMPLEMENTED YET" +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 0d166537a1a37..68d08701acd09 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -843,6 +844,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aa63371aa84c4..9ecee40eb0bcf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4682,6 +4682,8 @@ late_initcall(sched_core_sysctl_init); */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { + int ret; + __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -4718,12 +4720,16 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) - return -EAGAIN; - else if (rt_prio(p->prio)) + scx_pre_fork(p); + + if (dl_prio(p->prio)) { + ret = -EAGAIN; + goto out_cancel; + } else if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); @@ -4741,6 +4747,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif return 0; + +out_cancel: + scx_cancel_fork(p); + return ret; } int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) @@ -4771,16 +4781,18 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return 0; + return scx_fork(p); } void sched_cancel_fork(struct task_struct *p) { + scx_cancel_fork(p); } void sched_post_fork(struct task_struct *p) { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -5935,7 +5947,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. 
*/ - for_class_range(class, prev->sched_class, &idle_sched_class) { + for_balance_class_range(class, prev->sched_class, &idle_sched_class) { if (class->balance(rq, prev, rf)) break; } @@ -5953,6 +5965,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -5978,7 +5993,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) restart: put_prev_task_balance(rq, prev, rf); - for_each_class(class) { + for_each_active_class(class) { p = class->pick_next_task(rq); if (p) return p; @@ -6011,7 +6026,7 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_task(rq); if (p) return p; @@ -9953,6 +9968,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); + init_sched_ext_class(); psi_init(); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 index 0000000000000..6a93c48253399 --- /dev/null +++ b/kernel/sched/ext.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifdef CONFIG_SCHED_CLASS_EXT +#error "NOT IMPLEMENTED YET" +#else /* CONFIG_SCHED_CLASS_EXT */ + +#define scx_enabled() false + +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline void init_sched_ext_class(void) {} + +#define for_each_active_class for_each_class +#define for_balance_class_range for_class_range + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +#error "NOT IMPLEMENTED YET" +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e9ef66be2870e..65378f0be8dcc 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -407,11 +407,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + scx_update_idle(rq, false); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); + scx_update_idle(rq, true); schedstat_inc(rq->sched_goidle); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6397843b4482d..6c42b042daa45 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3411,4 +3411,6 @@ enum cpu_cftype_id { extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; #endif /* CONFIG_CGROUP_SCHED */ +#include "ext.h" + #endif /* _KERNEL_SCHED_SCHED_H */ From 5052bb0c352d7a4b78c665969a533639d9773562 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:06 -1000 Subject: [PATCH 013/304] sched_ext: Implement BPF extensible scheduler class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a new scheduler class sched_ext (SCX), which allows scheduling policies to be implemented as BPF programs to achieve the following: 1. Ease of experimentation and exploration: Enabling rapid iteration of new scheduling policies. 2. 
Customization: Building application-specific schedulers which implement policies that are not applicable to general-purpose schedulers. 3. Rapid scheduler deployments: Non-disruptive swap outs of scheduling policies in production environments. sched_ext leverages BPF’s struct_ops feature to define a structure which exports function callbacks and flags to BPF programs that wish to implement scheduling policies. The struct_ops structure exported by sched_ext is struct sched_ext_ops, and is conceptually similar to struct sched_class. The role of sched_ext is to map the complex sched_class callbacks to the more simple and ergonomic struct sched_ext_ops callbacks. For more detailed discussion on the motivations and overview, please refer to the cover letter. Later patches will also add several example schedulers and documentation. This patch implements the minimum core framework to enable implementation of BPF schedulers. Subsequent patches will gradually add functionalities including safety guarantee mechanisms, nohz and cgroup support. include/linux/sched/ext.h defines struct sched_ext_ops. With the comment on top, each operation should be self-explanatory. The followings are worth noting: * Both "sched_ext" and its shorthand "scx" are used. If the identifier already has "sched" in it, "ext" is used; otherwise, "scx". * In sched_ext_ops, only .name is mandatory. Every operation is optional and if omitted a simple but functional default behavior is provided. * A new policy constant SCHED_EXT is added and a task can select sched_ext by invoking sched_setscheduler(2) with the new policy constant. However, if the BPF scheduler is not loaded, SCHED_EXT is the same as SCHED_NORMAL and the task is scheduled by CFS. When the BPF scheduler is loaded, all tasks which have the SCHED_EXT policy are switched to sched_ext. * To bridge the workflow imbalance between the scheduler core and sched_ext_ops callbacks, sched_ext uses simple FIFOs called dispatch queues (dsq's). By default, there is one global dsq (SCX_DSQ_GLOBAL), and one local per-CPU dsq (SCX_DSQ_LOCAL). SCX_DSQ_GLOBAL is provided for convenience and need not be used by a scheduler that doesn't require it. SCX_DSQ_LOCAL is the per-CPU FIFO that sched_ext pulls from when putting the next task on the CPU. The BPF scheduler can manage an arbitrary number of dsq's using scx_bpf_create_dsq() and scx_bpf_destroy_dsq(). * sched_ext guarantees system integrity no matter what the BPF scheduler does. To enable this, each task's ownership is tracked through p->scx.ops_state and all tasks are put on scx_tasks list. The disable path can always recover and revert all tasks back to CFS. See p->scx.ops_state and scx_tasks. * A task is not tied to its rq while enqueued. This decouples CPU selection from queueing and allows sharing a scheduling queue across an arbitrary subset of CPUs. This adds some complexities as a task may need to be bounced between rq's right before it starts executing. See dispatch_to_local_dsq() and move_task_to_local_dsq(). * One complication that arises from the above weak association between task and rq is that synchronizing with dequeue() gets complicated as dequeue() may happen anytime while the task is enqueued and the dispatch path might need to release the rq lock to transfer the task. Solving this requires a bit of complexity. See the logic around p->scx.sticky_cpu and p->scx.ops_qseq. * Both enable and disable paths are a bit complicated. 
The enable path switches all tasks without blocking to avoid issues which can arise from partially switched states (e.g. the switching task itself being starved). The disable path can't trust the BPF scheduler at all, so it also has to guarantee forward progress without blocking. See scx_ops_enable() and scx_ops_disable_workfn(). * When sched_ext is disabled, static_branches are used to shut down the entry points from hot paths. v3: * ops.set_weight() added to allow BPF schedulers to track weight changes without polling p->scx.weight. * move_task_to_local_dsq() was losing SCX-specific enq_flags when enqueueing the task on the target dsq because it goes through activate_task() which loses the upper 32bit of the flags. Carry the flags through rq->scx.extra_enq_flags. * scx_bpf_dispatch(), scx_bpf_pick_idle_cpu(), scx_bpf_task_running() and scx_bpf_task_cpu() now use the new KF_RCU instead of KF_TRUSTED_ARGS to make it easier for BPF schedulers to call them. * The kfunc helper access control mechanism implemented through sched_ext_entity.kf_mask is improved. Now SCX_CALL_OP*() is always used when invoking scx_ops operations. v2: * balance_scx_on_up() is dropped. Instead, on UP, balance_scx() is called from put_prev_taks_scx() and pick_next_task_scx() as necessary. To determine whether balance_scx() should be called from put_prev_task_scx(), SCX_TASK_DEQD_FOR_SLEEP flag is added. See the comment in put_prev_task_scx() for details. * sched_deq_and_put_task() / sched_enq_and_set_task() sequences replaced with SCHED_CHANGE_BLOCK(). * Unused all_dsqs list removed. This was a left-over from previous iterations. * p->scx.kf_mask is added to track and enforce which kfunc helpers are allowed. Also, init/exit sequences are updated to make some kfuncs always safe to call regardless of the current BPF scheduler state. Combined, this should make all the kfuncs safe. * BPF now supports sleepable struct_ops operations. Hacky workaround removed and operations and kfunc helpers are tagged appropriately. * BPF now supports bitmask / cpumask helpers. scx_bpf_get_idle_cpumask() and friends are added so that BPF schedulers can use the idle masks with the generic helpers. This replaces the hacky kfunc helpers added by a separate patch in V1. * CONFIG_SCHED_CLASS_EXT can no longer be enabled if SCHED_CORE is enabled. This restriction will be removed by a later patch which adds core-sched support. * Add MAINTAINERS entries and other misc changes. 
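To make the new SCHED_EXT policy concrete, here is a minimal user-space sketch (not from the patch; the constant is added by this series to include/uapi/linux/sched.h and may not be in your libc headers yet) that flips the calling task over with sched_setscheduler(2). As noted above, with no BPF scheduler loaded the task keeps being scheduled by CFS as if it were SCHED_NORMAL.

        #include <errno.h>
        #include <sched.h>
        #include <stdio.h>
        #include <string.h>

        #ifndef SCHED_EXT
        #define SCHED_EXT 7     /* from include/uapi/linux/sched.h in this series */
        #endif

        int main(void)
        {
                struct sched_param param = { .sched_priority = 0 };

                if (sched_setscheduler(0, SCHED_EXT, &param)) {
                        fprintf(stderr, "sched_setscheduler(SCHED_EXT): %s\n",
                                strerror(errno));
                        return 1;
                }
                printf("now running with policy %d\n", sched_getscheduler(0));
                return 0;
        }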
Signed-off-by: Tejun Heo Co-authored-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- MAINTAINERS | 3 + include/asm-generic/vmlinux.lds.h | 1 + include/linux/sched.h | 5 + include/linux/sched/ext.h | 401 +++- include/uapi/linux/sched.h | 1 + init/init_task.c | 10 + kernel/Kconfig.preempt | 22 +- kernel/bpf/bpf_struct_ops_types.h | 4 + kernel/sched/build_policy.c | 4 + kernel/sched/core.c | 26 + kernel/sched/debug.c | 6 + kernel/sched/ext.c | 3036 +++++++++++++++++++++++++++++ kernel/sched/ext.h | 97 +- kernel/sched/sched.h | 16 + 14 files changed, 3628 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/ext.c diff --git a/MAINTAINERS b/MAINTAINERS index edd3d562beeeb..5273c5f895f64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18605,6 +18605,8 @@ R: Ben Segall (CONFIG_CFS_BANDWIDTH) R: Mel Gorman (CONFIG_NUMA_BALANCING) R: Daniel Bristot de Oliveira (SCHED_DEADLINE) R: Valentin Schneider (TOPOLOGY) +R: Tejun Heo (SCHED_EXT) +R: David Vernet (SCHED_EXT) L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core @@ -18613,6 +18615,7 @@ F: include/linux/sched.h F: include/linux/wait.h F: include/uapi/linux/sched.h F: kernel/sched/ +F: tools/sched_ext/ SCR24X CHIP CARD INTERFACE DRIVER M: Lubomir Rintel diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index d1f57e4868ed3..cd5a718ba49fc 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -131,6 +131,7 @@ *(__dl_sched_class) \ *(__rt_sched_class) \ *(__fair_sched_class) \ + *(__ext_sched_class) \ *(__idle_sched_class) \ __sched_class_lowest = .; diff --git a/include/linux/sched.h b/include/linux/sched.h index b11b4517760f1..d014c1681cdc6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -70,6 +70,8 @@ struct signal_struct; struct task_delay_info; struct task_group; +#include + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -788,6 +790,9 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index a05dfcf533b0c..45bf24a23c610 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -1,9 +1,408 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ #ifndef _LINUX_SCHED_EXT_H #define _LINUX_SCHED_EXT_H #ifdef CONFIG_SCHED_CLASS_EXT -#error "NOT IMPLEMENTED YET" + +#include +#include + +enum scx_consts { + SCX_OPS_NAME_LEN = 128, + SCX_EXIT_REASON_LEN = 128, + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + + SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. + * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. 
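 *
 * (Illustrative worked example, not from the patch: the LOCAL_ON ID that
 *  targets CPU 3 is SCX_DSQ_LOCAL_ON | 3, i.e. bit 63 | bit 62 | 3 ==
 *  0xc000000000000003, and the dispatch path later recovers the CPU with
 *  dsq_id & SCX_DSQ_LOCAL_CPU_MASK. User DSQs created by the BPF scheduler
 *  keep bit 63 clear, so small integers such as 0, 1, 2, ... are valid
 *  user DSQ IDs.)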
+ */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +enum scx_exit_type { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* BPF unregistration */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. + */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_type type; + /* textual representation of the above */ + char reason[SCX_EXIT_REASON_LEN]; + /* number of entries in the backtrace */ + u32 bt_len; + /* backtrace if exiting due to an error */ + unsigned long bt[SCX_EXIT_BT_LEN]; + /* extra message */ + char msg[SCX_EXIT_MSG_LEN]; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other task to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING, +}; + +/* argument container for ops.enable() and friends */ +struct scx_enable_args { + /* empty for now */ +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * Userland can implement an arbitrary scheduling policy by implementing and + * loading operations in this table. + */ +struct sched_ext_ops { + /** + * select_cpu - Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * enqueue - Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. 
Dispatch directly by calling scx_bpf_dispatch() + * or enqueue on the BPF scheduler. If not directly dispatched, the bpf + * scheduler owns @p and if it fails to dispatch @p, the task will + * stall. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * dequeue - Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using + * scx_bpf_consume(). + * + * The maximum number of times scx_bpf_dispatch() can be called without + * an intervening scx_bpf_consume() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * yield - Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * set_weight - Set task weight + * @p: task to set weight for + * @weight: new eight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * set_cpumask - Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, struct cpumask *cpumask); + + /** + * update_idle - Update the idle state of a CPU + * @cpu: CPU to udpate the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. 
By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * - scx_bpf_any_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * If you keep the built-in idle tracking, specify the + * %SCX_OPS_KEEP_BUILTIN_IDLE flag. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * prep_enable - Prepare to enable BPF scheduling for a task + * @p: task to prepare BPF scheduling for + * @args: enable arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Prepare BPF scheduling for @p. This operation may block and can be + * used for allocations. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, will + * abort the specific fork. + */ + s32 (*prep_enable)(struct task_struct *p, struct scx_enable_args *args); + + /** + * enable - Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * @args: enable arguments, see the struct definition + * + * Enable @p for BPF scheduling. @p will start running soon. + */ + void (*enable)(struct task_struct *p, struct scx_enable_args *args); + + /** + * cancel_enable - Cancel prep_enable() + * @p: task being canceled + * @args: enable arguments, see the struct definition + * + * @p was prep_enable()'d but failed before reaching enable(). Undo the + * preparation. + */ + void (*cancel_enable)(struct task_struct *p, + struct scx_enable_args *args); + + /** + * disable - Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. + */ + void (*disable)(struct task_struct *p); + + /* + * All online ops must come before ops.init(). + */ + + /** + * init - Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * exit - Clean up after the BPF scheduler + * @info: Exit info + */ + void (*exit)(struct scx_exit_info *info); + + /** + * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * flags - %SCX_OPS_* flags + */ + u64 flags; + + /** + * name - BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; +}; + +/* + * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the + * scheduler core and the BPF scheduler. See the documentation for more details. 
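 *
 * (Illustrative flow, not from the patch: with the defaults a runnable
 *  task typically moves
 *
 *	ops.enqueue() -> a user DSQ or SCX_DSQ_GLOBAL
 *	  -> ops.dispatch() / scx_bpf_consume() -> SCX_DSQ_LOCAL of a CPU
 *	  -> pulled off the local DSQ when that CPU picks its next task
 *
 *  while a direct scx_bpf_dispatch() to SCX_DSQ_LOCAL from ops.enqueue()
 *  skips the middle step entirely.)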
+ */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct list_head fifo; /* processed in dispatching order */ + u32 nr; + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ + SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */ + + SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ + SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ + + SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */ + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). + */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ + /* all non-sleepables may be nested inside INIT and SLEEPABLE */ + SCX_KF_INIT = 1 << 0, /* running ops.init() */ + SCX_KF_SLEEPABLE = 1 << 1, /* other sleepable init operations */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ + SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() */ + SCX_KF_REST = 1 << 5, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_REST, +}; + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. + */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct list_head dsq_node; + u32 flags; /* protected by rq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + atomic64_t ops_state; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dispatch() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. 
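 *
 * (For example, with the default SCX_SLICE_DFL of 20ms a CPU-bound task
 *  triggers a scheduling event after roughly 20ms of runtime; a BPF
 *  scheduler that wants 5ms slices could dispatch with
 *  scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 5 * NSEC_PER_MSEC, enq_flags) or
 *  simply overwrite p->scx.slice. Illustrative note, not from the patch.)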
+ */ + u64 slice; + + /* cold fields */ + struct list_head tasks_node; +}; + +void sched_ext_free(struct task_struct *p); + #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab26..359a14cc76a40 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -118,6 +118,7 @@ struct clone_args { /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 #define SCHED_DEADLINE 6 +#define SCHED_EXT 7 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bfe6b1..bdbc663107bfc 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -101,6 +102,15 @@ struct task_struct init_task #endif #ifdef CONFIG_CGROUP_SCHED .sched_task_group = &root_task_group, +#endif +#ifdef CONFIG_SCHED_CLASS_EXT + .scx = { + .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node), + .sticky_cpu = -1, + .holding_cpu = -1, + .ops_state = ATOMIC_INIT(0), + .slice = SCX_SLICE_DFL, + }, #endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c2f1fd95a8214..0afcda19bc50c 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -133,4 +133,24 @@ config SCHED_CORE which is the likely usage by Linux distributions, there should be no measurable impact on performance. - +config SCHED_CLASS_EXT + bool "Extensible Scheduling Class" + depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE + help + This option enables a new scheduler class sched_ext (SCX), which + allows scheduling policies to be implemented as BPF programs to + achieve the following: + + - Ease of experimentation and exploration: Enabling rapid + iteration of new scheduling policies. + - Customization: Building application-specific schedulers which + implement policies that are not applicable to general-purpose + schedulers. + - Rapid scheduler deployments: Non-disruptive swap outs of + scheduling policies in production environments. + + sched_ext leverages BPF’s struct_ops feature to define a structure + which exports function callbacks and flags to BPF programs that + wish to implement scheduling policies. The struct_ops structure + exported by sched_ext is struct sched_ext_ops, and is conceptually + similar to struct sched_class. 
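Since only .name is mandatory, a workable scheduler built on this structure can be very small. The sketch below shows a global-FIFO policy in the style of the example schedulers added later in the series; BPF_STRUCT_OPS and the scx_common.bpf.h header are assumptions borrowed from those examples' common code, while scx_bpf_dispatch(), scx_bpf_consume(), SCX_DSQ_GLOBAL, SCX_SLICE_DFL and struct sched_ext_ops are the interfaces introduced by this patch.

        /* minimal_fifo.bpf.c -- every task funnels through one global FIFO */
        #include "scx_common.bpf.h"     /* assumed helper header from the examples */

        void BPF_STRUCT_OPS(fifo_enqueue, struct task_struct *p, u64 enq_flags)
        {
                /* hand @p to the shared FIFO with the default 20ms slice */
                scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
        }

        void BPF_STRUCT_OPS(fifo_dispatch, s32 cpu, struct task_struct *prev)
        {
                /* refill this CPU's local DSQ from the global FIFO */
                scx_bpf_consume(SCX_DSQ_GLOBAL);
        }

        SEC(".struct_ops")
        struct sched_ext_ops fifo_ops = {
                .enqueue        = (void *)fifo_enqueue,
                .dispatch       = (void *)fifo_dispatch,
                .name           = "minimal_fifo",      /* the only mandatory field */
        };

Everything omitted (select_cpu, yield, the enable callbacks, ...) falls back to the default behavior described in the ops documentation above.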
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index 5678a9ddf8178..3618769d853d0 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -9,4 +9,8 @@ BPF_STRUCT_OPS_TYPE(bpf_dummy_ops) #include BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) #endif +#ifdef CONFIG_SCHED_CLASS_EXT +#include +BPF_STRUCT_OPS_TYPE(sched_ext_ops) +#endif #endif diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index d9dc9ab3773f2..4c658b21f603c 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -52,3 +53,6 @@ #include "cputime.c" #include "deadline.c" +#ifdef CONFIG_SCHED_CLASS_EXT +# include "ext.c" +#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9ecee40eb0bcf..a3fb6a05d1313 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4480,6 +4480,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; +#ifdef CONFIG_SCHED_CLASS_EXT + p->scx.dsq = NULL; + INIT_LIST_HEAD(&p->scx.dsq_node); + p->scx.flags = 0; + p->scx.weight = 0; + p->scx.sticky_cpu = -1; + p->scx.holding_cpu = -1; + p->scx.kf_mask = 0; + atomic64_set(&p->scx.ops_state, 0); + p->scx.slice = SCX_SLICE_DFL; +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -4727,6 +4739,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) goto out_cancel; } else if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; +#ifdef CONFIG_SCHED_CLASS_EXT + } else if (task_on_scx(p)) { + p->sched_class = &ext_sched_class; +#endif } else { p->sched_class = &fair_sched_class; } @@ -7001,6 +7017,10 @@ void __setscheduler_prio(struct task_struct *p, int prio) p->sched_class = &dl_sched_class; else if (rt_prio(prio)) p->sched_class = &rt_sched_class; +#ifdef CONFIG_SCHED_CLASS_EXT + else if (task_on_scx(p)) + p->sched_class = &ext_sched_class; +#endif else p->sched_class = &fair_sched_class; @@ -8927,6 +8947,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; break; } @@ -8954,6 +8975,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; } return ret; @@ -9799,6 +9821,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); +#ifdef CONFIG_SCHED_CLASS_EXT + BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); +#endif wait_bit_init(); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1637b65ba07ac..814ed80b8ff64 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -338,6 +338,9 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); +#ifdef CONFIG_SCHED_CLASS_EXT + debugfs_create_file("ext", 0444, debugfs_sched, NULL, &sched_ext_fops); +#endif return 0; } late_initcall(sched_init_debug); @@ -1047,6 +1050,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.runtime); P(dl.deadline); } +#ifdef CONFIG_SCHED_CLASS_EXT + __PS("ext.enabled", p->sched_class == &ext_sched_class); +#endif #undef PN_SCHEDSTAT #undef P_SCHEDSTAT diff --git 
a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 index 0000000000000..8e778d8ec59c9 --- /dev/null +++ b/kernel/sched/ext.c @@ -0,0 +1,3036 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_internal_consts { + SCX_NR_ONLINE_OPS = SCX_OP_IDX(init), + SCX_DSP_DFL_MAX_BATCH = 32, +}; + +enum scx_ops_enable_state { + SCX_OPS_PREPPING, + SCX_OPS_ENABLING, + SCX_OPS_ENABLED, + SCX_OPS_DISABLING, + SCX_OPS_DISABLED, +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + */ + SCX_OPSS_QSEQ_SHIFT = 2, + SCX_OPSS_STATE_MASK = (1LLU << SCX_OPSS_QSEQ_SHIFT) - 1, + SCX_OPSS_QSEQ_MASK = ~SCX_OPSS_STATE_MASK, +}; + +/* + * During exit, a task may schedule after losing its PIDs. When disabling the + * BPF scheduler, we need to be able to iterate tasks in every state to + * guarantee system safety. Maintain a dedicated task list which contains every + * task between its fork and eventual free. + */ +static DEFINE_SPINLOCK(scx_tasks_lock); +static LIST_HEAD(scx_tasks); + +/* ops enable/disable */ +static struct kthread_worker *scx_ops_helper; +static DEFINE_MUTEX(scx_ops_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static struct sched_ext_ops scx_ops; +static bool scx_warned_zero_slice; + +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = + { [0 ... 
SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT }; + +static atomic_t scx_exit_type = ATOMIC_INIT(SCX_EXIT_DONE); +static struct scx_exit_info scx_exit_info; + +static atomic64_t scx_nr_rejected = ATOMIC64_INIT(0); + +/* idle tracking */ +#ifdef CONFIG_SMP +#ifdef CONFIG_CPUMASK_OFFSTACK +#define CL_ALIGNED_IF_ONSTACK +#else +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp +#endif + +static struct { + cpumask_var_t cpu; + cpumask_var_t smt; +} idle_masks CL_ALIGNED_IF_ONSTACK; + +static bool __cacheline_aligned_in_smp scx_has_idle_cpus; +#endif /* CONFIG_SMP */ + +/* + * Direct dispatch marker. + * + * Non-NULL values are used for direct dispatch from enqueue path. A valid + * pointer points to the task currently being enqueued. An ERR_PTR value is used + * to indicate that direct dispatch has already happened. + */ +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); + +/* dispatch queues */ +static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; + +static const struct rhashtable_params dsq_hash_params = { + .key_len = 8, + .key_offset = offsetof(struct scx_dispatch_q, id), + .head_offset = offsetof(struct scx_dispatch_q, hash_node), +}; + +static struct rhashtable dsq_hash; +static LLIST_HEAD(dsqs_to_free); + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + u64 qseq; + u64 dsq_id; + u64 enq_flags; +}; + +static u32 scx_dsp_max_batch; +static struct scx_dsp_buf_ent __percpu *scx_dsp_buf; + +struct scx_dsp_ctx { + struct rq *rq; + struct rq_flags *rf; + u32 buf_cursor; + u32 nr_tasks; +}; + +static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); + +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags); +__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type, + const char *fmt, ...); +#define scx_ops_error(fmt, args...) \ + scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) + +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; +}; + +#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) + +/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ +static u32 higher_bits(u32 flags) +{ + return ~((1 << fls(flags)) - 1); +} + +/* return the mask with only the highest bit set */ +static u32 highest_bit(u32 flags) +{ + int bit = fls(flags); + return bit ? 1 << (bit - 1) : 0; +} + +/* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check + * whether it's running from an allowed context. + * + * @mask is constant, always inline to cull the mask calculations. + */ +static __always_inline void scx_kf_allow(u32 mask) +{ + /* nesting is allowed only in increasing scx_kf_mask order */ + WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, + "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", + current->scx.kf_mask, mask); + current->scx.kf_mask |= mask; +} + +static void scx_kf_disallow(u32 mask) +{ + current->scx.kf_mask &= ~mask; +} + +#define SCX_CALL_OP(mask, op, args...) \ +do { \ + if (mask) { \ + scx_kf_allow(mask); \ + scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + scx_ops.op(args); \ + } \ +} while (0) + +#define SCX_CALL_OP_RET(mask, op, args...) 
\ +({ \ + __typeof__(scx_ops.op(args)) __ret; \ + if (mask) { \ + scx_kf_allow(mask); \ + __ret = scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + __ret = scx_ops.op(args); \ + } \ + __ret; \ +}) + +/* @mask is constant, always inline to cull unnecessary branches */ +static __always_inline bool scx_kf_allowed(u32 mask) +{ + if (unlikely(!(current->scx.kf_mask & mask))) { + scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); + return false; + } + + if (unlikely((mask & (SCX_KF_INIT | SCX_KF_SLEEPABLE)) && + in_interrupt())) { + scx_ops_error("sleepable kfunc called from non-sleepable context"); + return false; + } + + /* + * Enforce nesting boundaries. e.g. A kfunc which can be called from + * DISPATCH must not be called if we're running DEQUEUE which is nested + * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE + * boundary thanks to the above in_interrupt() check. + */ + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && + (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { + scx_ops_error("dispatch kfunc called from a nested operation"); + return false; + } + + return true; +} + +/** + * scx_task_iter_init - Initialize a task iterator + * @iter: iterator to init + * + * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, + * @iter must eventually be exited with scx_task_iter_exit(). + * + * scx_tasks_lock may be released between this and the first next() call or + * between any two next() calls. If scx_tasks_lock is released between two + * next() calls, the caller is responsible for ensuring that the task being + * iterated remains accessible either through RCU read lock or obtaining a + * reference count. + * + * All tasks which existed when the iteration started are guaranteed to be + * visited as long as they still exist. + */ +static void scx_task_iter_init(struct scx_task_iter *iter) +{ + lockdep_assert_held(&scx_tasks_lock); + + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; +} + +/** + * scx_task_iter_exit - Exit a task iterator + * @iter: iterator to exit + * + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. + * If the iterator holds a task's rq lock, that rq lock is released. See + * scx_task_iter_init() for details. + */ +static void scx_task_iter_exit(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + + lockdep_assert_held(&scx_tasks_lock); + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + if (list_empty(cursor)) + return; + + list_del_init(cursor); +} + +/** + * scx_task_iter_next - Next task + * @iter: iterator to walk + * + * Visit the next task. See scx_task_iter_init() for details. 
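 *
 * (Illustrative aside, not from the patch: the iterator plants a dummy
 *  sched_ext_entity flagged SCX_TASK_CURSOR on scx_tasks and list_move()s
 *  it past each entry it returns, e.g.
 *
 *	[cursor] A B C		next() returns A
 *	A [cursor] B C		next() returns B
 *	A B [cursor] C		next() returns C
 *
 *  Because the cursor itself lives on the list, scx_tasks_lock can be
 *  dropped and re-taken between next() calls without losing the position,
 *  which is what lets the enable/disable paths walk every task in the
 *  system safely.)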
+ */ +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + struct sched_ext_entity *pos; + + lockdep_assert_held(&scx_tasks_lock); + + list_for_each_entry(pos, cursor, tasks_node) { + if (&pos->tasks_node == &scx_tasks) + return NULL; + if (!(pos->flags & SCX_TASK_CURSOR)) { + list_move(cursor, &pos->tasks_node); + return container_of(pos, struct task_struct, scx); + } + } + + /* can't happen, should always terminate at scx_tasks above */ + BUG(); +} + +/** + * scx_task_iter_next_filtered - Next non-idle task + * @iter: iterator to walk + * + * Visit the next non-idle task. See scx_task_iter_init() for details. + */ +static struct task_struct * +scx_task_iter_next_filtered(struct scx_task_iter *iter) +{ + struct task_struct *p; + + while ((p = scx_task_iter_next(iter))) { + if (!is_idle_task(p)) + return p; + } + return NULL; +} + +/** + * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked + * @iter: iterator to walk + * + * Visit the next non-idle task with its rq lock held. See scx_task_iter_init() + * for details. + */ +static struct task_struct * +scx_task_iter_next_filtered_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + p = scx_task_iter_next_filtered(iter); + if (!p) + return NULL; + + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + return p; +} + +static enum scx_ops_enable_state scx_ops_enable_state(void) +{ + return atomic_read(&scx_ops_enable_state_var); +} + +static enum scx_ops_enable_state +scx_ops_set_enable_state(enum scx_ops_enable_state to) +{ + return atomic_xchg(&scx_ops_enable_state_var, to); +} + +static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, + enum scx_ops_enable_state from) +{ + int from_v = from; + + return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); +} + +static bool scx_ops_disabling(void) +{ + return unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING); +} + +/** + * wait_ops_state - Busy-wait the specified ops state to end + * @p: target task + * @opss: state to wait the end of + * + * Busy-wait for @p to transition out of @opss. This can only be used when the + * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also + * has load_acquire semantics to ensure that the caller can see the updates made + * in the enqueueing and dispatching paths. + */ +static void wait_ops_state(struct task_struct *p, u64 opss) +{ + do { + cpu_relax(); + } while (atomic64_read_acquire(&p->scx.ops_state) == opss); +} + +/** + * ops_cpu_valid - Verify a cpu number + * @cpu: cpu number which came from a BPF ops + * + * @cpu is a cpu number which came from the BPF scheduler and can be any value. + * Verify that it is in range and one of the possible cpus. + */ +static bool ops_cpu_valid(s32 cpu) +{ + return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); +} + +/** + * ops_sanitize_err - Sanitize a -errno value + * @ops_name: operation to blame on failure + * @err: -errno value to sanitize + * + * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * -%EPROTO. This is necessary because returning a rogue -errno up the chain can + * cause misbehaviors. 
For an example, a large negative return from + * ops.prep_enable() triggers an oops when passed up the call chain because the + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is + * handled as a pointer. + */ +static int ops_sanitize_err(const char *ops_name, s32 err) +{ + if (err < 0 && err >= -MAX_ERRNO) + return err; + + scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + return -EPROTO; +} + +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + u64 now = rq_clock_task(rq); + u64 delta_exec; + + if (time_before_eq64(now, curr->se.exec_start)) + return; + + delta_exec = now - curr->se.exec_start; + curr->se.exec_start = now; + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); + + curr->scx.slice -= min(curr->scx.slice, delta_exec); +} + +static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + bool is_local = dsq->id == SCX_DSQ_LOCAL; + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); + + if (!is_local) { + raw_spin_lock(&dsq->lock); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { + scx_ops_error("attempting to dispatch to a destroyed dsq"); + /* fall back to the global dsq */ + raw_spin_unlock(&dsq->lock); + dsq = &scx_dsq_global; + raw_spin_lock(&dsq->lock); + } + } + + if (enq_flags & SCX_ENQ_HEAD) + list_add(&p->scx.dsq_node, &dsq->fifo); + else + list_add_tail(&p->scx.dsq_node, &dsq->fifo); + dsq->nr++; + p->scx.dsq = dsq; + + /* + * We're transitioning out of QUEUEING or DISPATCHING. store_release to + * match waiters' load_acquire. + */ + if (enq_flags & SCX_ENQ_CLEAR_OPSS) + atomic64_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + if (is_local) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (sched_class_above(&ext_sched_class, rq->curr->sched_class)) + resched_curr(rq); + } else { + raw_spin_unlock(&dsq->lock); + } +} + +static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) +{ + struct scx_dispatch_q *dsq = p->scx.dsq; + bool is_local = dsq == &scx_rq->local_dsq; + + if (!dsq) { + WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + /* + * When dispatching directly from the BPF scheduler to a local + * DSQ, the task isn't associated with any DSQ but + * @p->scx.holding_cpu may be set under the protection of + * %SCX_OPSS_DISPATCHING. + */ + if (p->scx.holding_cpu >= 0) + p->scx.holding_cpu = -1; + return; + } + + if (!is_local) + raw_spin_lock(&dsq->lock); + + /* + * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node + * can't change underneath us. + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ + WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); + list_del_init(&p->scx.dsq_node); + dsq->nr--; + } else { + /* + * We're racing against dispatch_to_local_dsq() which already + * removed @p from @dsq and set @p->scx.holding_cpu. Clear the + * holding_cpu which tells dispatch_to_local_dsq() that it lost + * the race. 
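 *
 * (Illustrative timeline, not from the patch:
 *
 *	dispatcher				dequeue (rq lock held)
 *	p->scx.holding_cpu = this CPU
 *	drop dsq->lock / release DISPATCHING
 *	lock the involved rq's			sees holding_cpu >= 0,
 *						sets holding_cpu = -1
 *	holding_cpu still == this CPU?
 *	  yes: move @p to the local DSQ
 *	  no: dequeue won, give up
 *
 *  i.e. once @p has left its DSQ, holding_cpu doubles as the flag that
 *  decides which side completes the transfer.)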
+ */ + WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + p->scx.holding_cpu = -1; + } + p->scx.dsq = NULL; + + if (!is_local) + raw_spin_unlock(&dsq->lock); +} + +static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) +{ + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_GLOBAL) + return &scx_dsq_global; + else + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, + dsq_hash_params); +} + +static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, + struct task_struct *p) +{ + struct scx_dispatch_q *dsq; + + if (dsq_id == SCX_DSQ_LOCAL) + return &rq->scx.local_dsq; + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); + return &scx_dsq_global; + } + + return dsq; +} + +static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p, + u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + + /* @p must match the task which is being enqueued */ + if (unlikely(p != ddsp_task)) { + if (IS_ERR(ddsp_task)) + scx_ops_error("%s[%d] already direct-dispatched", + p->comm, p->pid); + else + scx_ops_error("enqueueing %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); + return; + } + + /* + * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because + * dispatching to the local DSQ of a different CPU requires unlocking + * the current rq which isn't allowed in the enqueue path. Use + * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. + */ + if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { + scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); + return; + } + + dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + + /* + * Mark that dispatch already happened by spoiling direct_dispatch_task + * with a non-NULL value which can never match a valid task pointer. + */ + __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); +} + +static bool test_rq_online(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->online; +#else + return true; +#endif +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + int sticky_cpu) +{ + struct task_struct **ddsp_taskp; + u64 qseq; + + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + + if (p->scx.flags & SCX_TASK_ENQ_LOCAL) { + enq_flags |= SCX_ENQ_LOCAL; + p->scx.flags &= ~SCX_TASK_ENQ_LOCAL; + } + + /* rq migration */ + if (sticky_cpu == cpu_of(rq)) + goto local_norefill; + + /* + * If !rq->online, we already told the BPF scheduler that the CPU is + * offline. We're just trying to on/offline the CPU. Don't bother the + * BPF scheduler. 
+ */ + if (unlikely(!test_rq_online(rq))) + goto local; + + /* see %SCX_OPS_ENQ_EXITING */ + if (!static_branch_unlikely(&scx_ops_enq_exiting) && + unlikely(p->flags & PF_EXITING)) + goto local; + + /* see %SCX_OPS_ENQ_LAST */ + if (!static_branch_unlikely(&scx_ops_enq_last) && + (enq_flags & SCX_ENQ_LAST)) + goto local; + + if (!SCX_HAS_OP(enqueue)) { + if (enq_flags & SCX_ENQ_LOCAL) + goto local; + else + goto global; + } + + /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ + qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; + + WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + atomic64_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + + /* + * If not directly dispatched, QUEUEING isn't clear yet and dispatch or + * dequeue may be waiting. The store_release matches their load_acquire. + */ + if (*ddsp_taskp == p) + atomic64_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + *ddsp_taskp = NULL; + return; + +local: + p->scx.slice = SCX_SLICE_DFL; +local_norefill: + dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); + return; + +global: + p->scx.slice = SCX_SLICE_DFL; + dispatch_enqueue(&scx_dsq_global, p, enq_flags); +} + +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) +{ + int sticky_cpu = p->scx.sticky_cpu; + + enq_flags |= rq->scx.extra_enq_flags; + + if (sticky_cpu >= 0) + p->scx.sticky_cpu = -1; + + /* + * Restoring a running task will be immediately followed by + * set_next_task_scx() which expects the task to not be on the BPF + * scheduler as tasks can only start running through local DSQs. Force + * direct-dispatch into the local DSQ by setting the sticky_cpu. + */ + if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) + sticky_cpu = cpu_of(rq); + + if (p->scx.flags & SCX_TASK_QUEUED) + return; + + p->scx.flags |= SCX_TASK_QUEUED; + rq->scx.nr_running++; + add_nr_running(rq, 1); + + do_enqueue_task(rq, p, enq_flags, sticky_cpu); +} + +static void ops_dequeue(struct task_struct *p, u64 deq_flags) +{ + u64 opss; + + /* acquire ensures that we see the preceding updates on QUEUED */ + opss = atomic64_read_acquire(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_NONE: + break; + case SCX_OPSS_QUEUEING: + /* + * QUEUEING is started and finished while holding @p's rq lock. + * As we're holding the rq lock now, we shouldn't see QUEUEING. + */ + BUG(); + case SCX_OPSS_QUEUED: + if (SCX_HAS_OP(dequeue)) + SCX_CALL_OP(SCX_KF_REST, dequeue, p, deq_flags); + + if (atomic64_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_NONE)) + break; + fallthrough; + case SCX_OPSS_DISPATCHING: + /* + * If @p is being dispatched from the BPF scheduler to a DSQ, + * wait for the transfer to complete so that @p doesn't get + * added to its DSQ after dequeueing is complete. + * + * As we're waiting on DISPATCHING with the rq locked, the + * dispatching side shouldn't try to lock the rq while + * DISPATCHING is set. See dispatch_to_local_dsq(). + * + * DISPATCHING shouldn't have qseq set and control can reach + * here with NONE @opss from the above QUEUED case block. + * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
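 *
 * (Illustrative worked example, not from the patch: ops_state packs the
 *  state into the two low bits and the qseq brand above them, so an
 *  enqueue that samples rq->scx.ops_qseq == 5 stores
 *
 *	SCX_OPSS_QUEUED | (5 << SCX_OPSS_QSEQ_SHIFT) == 2 | 20 == 22
 *
 *  and a dispatch that was recorded against an older qseq later fails the
 *  comparison in finish_dispatch() and gives up its claim.)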
+ */ + wait_ops_state(p, SCX_OPSS_DISPATCHING); + BUG_ON(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + break; + } +} + +static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +{ + struct scx_rq *scx_rq = &rq->scx; + + if (!(p->scx.flags & SCX_TASK_QUEUED)) + return; + + ops_dequeue(p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) + p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; + else + p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; + + p->scx.flags &= ~SCX_TASK_QUEUED; + scx_rq->nr_running--; + sub_nr_running(rq, 1); + + dispatch_dequeue(scx_rq, p); +} + +static void yield_task_scx(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (SCX_HAS_OP(yield)) + SCX_CALL_OP_RET(SCX_KF_REST, yield, p, NULL); + else + p->scx.slice = 0; +} + +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) +{ + struct task_struct *from = rq->curr; + + if (SCX_HAS_OP(yield)) + return SCX_CALL_OP_RET(SCX_KF_REST, yield, from, to); + else + return false; +} + +#ifdef CONFIG_SMP +/** + * move_task_to_local_dsq - Move a task from a different rq to a local DSQ + * @rq: rq to move the task into, currently locked + * @p: task to move + * @enq_flags: %SCX_ENQ_* + * + * Move @p which is currently on a different rq to @rq's local DSQ. The caller + * must: + * + * 1. Start with exclusive access to @p either through its DSQ lock or + * %SCX_OPSS_DISPATCHING flag. + * + * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). + * + * 3. Remember task_rq(@p). Release the exclusive access so that we don't + * deadlock with dequeue. + * + * 4. Lock @rq and the task_rq from #3. + * + * 5. Call this function. + * + * Returns %true if @p was successfully moved. %false after racing dequeue and + * losing. + */ +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *task_rq; + + lockdep_assert_rq_held(rq); + + /* + * If dequeue got to @p while we were trying to lock both rq's, it'd + * have cleared @p->scx.holding_cpu to -1. While other cpus may have + * updated it to different values afterwards, as this operation can't be + * preempted or recurse, @p->scx.holding_cpu can never become + * raw_smp_processor_id() again before we're done. Thus, we can tell + * whether we lost to dequeue by testing whether @p->scx.holding_cpu is + * still raw_smp_processor_id(). + * + * See dispatch_dequeue() for the counterpart. + */ + if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) + return false; + + /* @p->rq couldn't have changed if we're still the holding cpu */ + task_rq = task_rq(p); + lockdep_assert_rq_held(task_rq); + + WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); + deactivate_task(task_rq, p, 0); + set_task_cpu(p, cpu_of(rq)); + p->scx.sticky_cpu = cpu_of(rq); + + /* + * We want to pass scx-specific enq_flags but activate_task() will + * truncate the upper 32 bit. As we own @rq, we can pass them through + * @rq->scx.extra_enq_flags instead. + */ + WARN_ON_ONCE(rq->scx.extra_enq_flags); + rq->scx.extra_enq_flags = enq_flags; + activate_task(rq, p, 0); + rq->scx.extra_enq_flags = 0; + + return true; +} + +/** + * dispatch_to_local_dsq_lock - Ensure source and desitnation rq's are locked + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @src_rq: rq to move task from + * @dst_rq: rq to move task to + * + * We're holding @rq lock and trying to dispatch a task from @src_rq to + * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. 
Whether + * @rq stays locked isn't important as long as the state is restored after + * dispatch_to_local_dsq_unlock(). + */ +static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, + struct rq *src_rq, struct rq *dst_rq) +{ + rq_unpin_lock(rq, rf); + + if (src_rq == dst_rq) { + raw_spin_rq_unlock(rq); + raw_spin_rq_lock(dst_rq); + } else if (rq == src_rq) { + double_lock_balance(rq, dst_rq); + rq_repin_lock(rq, rf); + } else if (rq == dst_rq) { + double_lock_balance(rq, src_rq); + rq_repin_lock(rq, rf); + } else { + raw_spin_rq_unlock(rq); + double_rq_lock(src_rq, dst_rq); + } +} + +/** + * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @src_rq: rq to move task from + * @dst_rq: rq to move task to + * + * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. + */ +static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, + struct rq *src_rq, struct rq *dst_rq) +{ + if (src_rq == dst_rq) { + raw_spin_rq_unlock(dst_rq); + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + } else if (rq == src_rq) { + double_unlock_balance(rq, dst_rq); + } else if (rq == dst_rq) { + double_unlock_balance(rq, src_rq); + } else { + double_rq_unlock(src_rq, dst_rq); + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + } +} +#endif /* CONFIG_SMP */ + + +static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, + struct scx_dispatch_q *dsq) +{ + struct scx_rq *scx_rq = &rq->scx; + struct task_struct *p; + struct rq *task_rq; + bool moved = false; +retry: + if (list_empty(&dsq->fifo)) + return false; + + raw_spin_lock(&dsq->lock); + list_for_each_entry(p, &dsq->fifo, scx.dsq_node) { + task_rq = task_rq(p); + if (rq == task_rq) + goto this_rq; + if (likely(test_rq_online(rq)) && !is_migration_disabled(p) && + cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)) + goto remote_rq; + } + raw_spin_unlock(&dsq->lock); + return false; + +this_rq: + /* @dsq is locked and @p is on this rq */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + list_move_tail(&p->scx.dsq_node, &scx_rq->local_dsq.fifo); + dsq->nr--; + scx_rq->local_dsq.nr++; + p->scx.dsq = &scx_rq->local_dsq; + raw_spin_unlock(&dsq->lock); + return true; + +remote_rq: +#ifdef CONFIG_SMP + /* + * @dsq is locked and @p is on a remote rq. @p is currently protected by + * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab + * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the + * rq lock or fail, do a little dancing from our side. See + * move_task_to_local_dsq(). 
+ */ + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + list_del_init(&p->scx.dsq_node); + dsq->nr--; + p->scx.holding_cpu = raw_smp_processor_id(); + raw_spin_unlock(&dsq->lock); + + rq_unpin_lock(rq, rf); + double_lock_balance(rq, task_rq); + rq_repin_lock(rq, rf); + + moved = move_task_to_local_dsq(rq, p, 0); + + double_unlock_balance(rq, task_rq); +#endif /* CONFIG_SMP */ + if (likely(moved)) + return true; + goto retry; +} + +enum dispatch_to_local_dsq_ret { + DTL_DISPATCHED, /* successfully dispatched */ + DTL_LOST, /* lost race to dequeue */ + DTL_NOT_LOCAL, /* destination is not a local DSQ */ + DTL_INVALID, /* invalid local dsq_id */ +}; + +/** + * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @dsq_id: destination dsq ID + * @p: task to dispatch + * @enq_flags: %SCX_ENQ_* + * + * We're holding @rq lock and want to dispatch @p to the local DSQ identified by + * @dsq_id. This function performs all the synchronization dancing needed + * because local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). + */ +static enum dispatch_to_local_dsq_ret +dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, + struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); + struct rq *dst_rq; + + /* + * We're synchronized against dequeue through DISPATCHING. As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. + */ + if (dsq_id == SCX_DSQ_LOCAL) { + dst_rq = rq; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (!ops_cpu_valid(cpu)) { + scx_ops_error("invalid cpu %d in SCX_DSQ_LOCAL_ON verdict for %s[%d]", + cpu, p->comm, p->pid); + return DTL_INVALID; + } + dst_rq = cpu_rq(cpu); + } else { + return DTL_NOT_LOCAL; + } + + /* if dispatching to @rq that @p is already on, no lock dancing needed */ + if (rq == src_rq && rq == dst_rq) { + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); + return DTL_DISPATCHED; + } + +#ifdef CONFIG_SMP + if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { + struct rq *locked_dst_rq = dst_rq; + bool dsp; + + /* + * @p is on a possibly remote @src_rq which we need to lock to + * move the task. If dequeue is in progress, it'd be locking + * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq + * lock while holding DISPATCHING. + * + * As DISPATCHING guarantees that @p is wholly ours, we can + * pretend that we're moving from a DSQ and use the same + * mechanism - mark the task under transfer with holding_cpu, + * release DISPATCHING and then follow the same protocol. + */ + p->scx.holding_cpu = raw_smp_processor_id(); + + /* store_release ensures that dequeue sees the above */ + atomic64_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); + + /* + * We don't require the BPF scheduler to avoid dispatching to + * offline CPUs mostly for convenience but also because CPUs can + * go offline between scx_bpf_dispatch() calls and here. If @p + * is destined to an offline CPU, queue it on its current CPU + * instead, which should always be safe. As this is an allowed + * behavior, don't trigger an ops error. 
+ */ + if (unlikely(!test_rq_online(dst_rq))) + dst_rq = src_rq; + + if (src_rq == dst_rq) { + /* + * As @p is staying on the same rq, there's no need to + * go through the full deactivate/activate cycle. + * Optimize by abbreviating the operations in + * move_task_to_local_dsq(). + */ + dsp = p->scx.holding_cpu == raw_smp_processor_id(); + if (likely(dsp)) { + p->scx.holding_cpu = -1; + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags); + } + } else { + dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); + } + + /* if the destination CPU is idle, wake it up */ + if (dsp && p->sched_class > dst_rq->curr->sched_class) + resched_curr(dst_rq); + + dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); + + return dsp ? DTL_DISPATCHED : DTL_LOST; + } +#endif /* CONFIG_SMP */ + + scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", + cpu_of(dst_rq), p->comm, p->pid); + return DTL_INVALID; +} + +/** + * finish_dispatch - Asynchronously finish dispatching a task + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @p: task to finish dispatching + * @qseq_at_dispatch: qseq when @p started getting dispatched + * @dsq_id: destination DSQ ID + * @enq_flags: %SCX_ENQ_* + * + * Dispatching to local DSQs may need to wait for queueing to complete or + * require rq lock dancing. As we don't wanna do either while inside + * ops.dispatch() to avoid locking order inversion, we split dispatching into + * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the + * task and its qseq. Once ops.dispatch() returns, this function is called to + * finish up. + * + * There is no guarantee that @p is still valid for dispatching or even that it + * was valid in the first place. Make sure that the task is still owned by the + * BPF scheduler and claim the ownership before dispatching. + */ +static void finish_dispatch(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, u64 qseq_at_dispatch, + u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + u64 opss; + +retry: + /* + * No need for _acquire here. @p is accessed only after a successful + * try_cmpxchg to DISPATCHING. + */ + opss = atomic64_read(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_DISPATCHING: + case SCX_OPSS_NONE: + /* someone else already got to it */ + return; + case SCX_OPSS_QUEUED: + /* + * If qseq doesn't match, @p has gone through at least one + * dispatch/dequeue and re-enqueue cycle between + * scx_bpf_dispatch() and here and we have no claim on it. + */ + if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) + return; + + /* + * While we know @p is accessible, we don't yet have a claim on + * it - the BPF scheduler is allowed to dispatch tasks + * spuriously and there can be a racing dequeue attempt. Let's + * claim @p by atomically transitioning it from QUEUED to + * DISPATCHING. + */ + if (likely(atomic64_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_DISPATCHING))) + break; + goto retry; + case SCX_OPSS_QUEUEING: + /* + * do_enqueue_task() is in the process of transferring the task + * to the BPF scheduler while holding @p's rq lock. As we aren't + * holding any kernel or BPF resource that the enqueue path may + * depend upon, it's safe to wait. 
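+ *
+ * (To recap the whole retry loop: QUEUEING: wait and retry; QUEUED:
+ * claim @p by cmpxchg'ing to DISPATCHING if the qseq still matches;
+ * DISPATCHING/NONE: someone else already got to @p, nothing to do.)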
+ */ + wait_ops_state(p, opss); + goto retry; + } + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + + switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { + case DTL_DISPATCHED: + break; + case DTL_LOST: + break; + case DTL_INVALID: + dsq_id = SCX_DSQ_GLOBAL; + fallthrough; + case DTL_NOT_LOCAL: + dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), + dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + break; + } +} + +static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + u32 u; + + for (u = 0; u < dspc->buf_cursor; u++) { + struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u]; + + finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, + ent->enq_flags); + } + + dspc->nr_tasks += dspc->buf_cursor; + dspc->buf_cursor = 0; +} + +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + struct scx_rq *scx_rq = &rq->scx; + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + bool prev_on_scx = prev->sched_class == &ext_sched_class; + + lockdep_assert_rq_held(rq); + + if (prev_on_scx) { + WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); + update_curr_scx(rq); + + /* + * If @prev is runnable & has slice left, it has priority and + * fetching more just increases latency for the fetched tasks. + * Tell put_prev_task_scx() to put @prev on local_dsq. + * + * See scx_ops_disable_workfn() for the explanation on the + * disabling() test. + */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && + prev->scx.slice && !scx_ops_disabling()) { + prev->scx.flags |= SCX_TASK_BAL_KEEP; + return 1; + } + } + + /* if there already are tasks to run, nothing to do */ + if (scx_rq->local_dsq.nr) + return 1; + + if (consume_dispatch_q(rq, rf, &scx_dsq_global)) + return 1; + + if (!SCX_HAS_OP(dispatch)) + return 0; + + dspc->rq = rq; + dspc->rf = rf; + + /* + * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, + * the local DSQ might still end up empty after a successful + * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() + * produced some tasks, retry. The BPF scheduler may depend on this + * looping behavior to simplify its implementation. + */ + do { + dspc->nr_tasks = 0; + + SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), + prev_on_scx ? prev : NULL); + + flush_dispatch_buf(rq, rf); + + if (scx_rq->local_dsq.nr) + return 1; + if (consume_dispatch_q(rq, rf, &scx_dsq_global)) + return 1; + } while (dspc->nr_tasks); + + return 0; +} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ + if (p->scx.flags & SCX_TASK_QUEUED) { + WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + dispatch_dequeue(&rq->scx, p); + } + + p->se.exec_start = rq_clock_task(rq); +} + +static void put_prev_task_scx(struct rq *rq, struct task_struct *p) +{ +#ifndef CONFIG_SMP + /* + * UP workaround. + * + * Because SCX may transfer tasks across CPUs during dispatch, dispatch + * is performed from its balance operation which isn't called in UP. + * Let's work around by calling it from the operations which come right + * after. + * + * 1. If the prev task is on SCX, pick_next_task() calls + * .put_prev_task() right after. As .put_prev_task() is also called + * from other places, we need to distinguish the calls which can be + * done by looking at the previous task's state - if still queued or + * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). + * This case is handled here. + * + * 2. 
If the prev task is not on SCX, the first following call into SCX + * will be .pick_next_task(), which is covered by calling + * balance_scx() from pick_next_task_scx(). + * + * Note that we can't merge the first case into the second as + * balance_scx() must be called before the previous SCX task goes + * through put_prev_task_scx(). + * + * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. + * Pass in %NULL. + */ + if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) + balance_scx(rq, p, NULL); +#endif + + update_curr_scx(rq); + + /* + * If we're being called from put_prev_task_balance(), balance_scx() may + * have decided that @p should keep running. + */ + if (p->scx.flags & SCX_TASK_BAL_KEEP) { + p->scx.flags &= ~SCX_TASK_BAL_KEEP; + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } + + if (p->scx.flags & SCX_TASK_QUEUED) { + /* + * If @p has slice left and balance_scx() didn't tag it for + * keeping, @p is getting preempted by a higher priority + * scheduler class. Leave it at the head of the local DSQ. + */ + if (p->scx.slice && !scx_ops_disabling()) { + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } + + /* + * If we're in the pick_next_task path, balance_scx() should + * have already populated the local DSQ if there are any other + * available tasks. If empty, tell ops.enqueue() that @p is the + * only one available for this cpu. ops.enqueue() should put it + * on the local DSQ so that the subsequent pick_next_task_scx() + * can find the task unless it wants to trigger a separate + * follow-up scheduling event. + */ + if (list_empty(&rq->scx.local_dsq.fifo)) + do_enqueue_task(rq, p, SCX_ENQ_LAST | SCX_ENQ_LOCAL, -1); + else + do_enqueue_task(rq, p, 0, -1); + } +} + +static struct task_struct *first_local_task(struct rq *rq) +{ + return list_first_entry_or_null(&rq->scx.local_dsq.fifo, + struct task_struct, scx.dsq_node); +} + +static struct task_struct *pick_next_task_scx(struct rq *rq) +{ + struct task_struct *p; + +#ifndef CONFIG_SMP + /* UP workaround - see the comment at the head of put_prev_task_scx() */ + if (unlikely(rq->curr->sched_class != &ext_sched_class)) + balance_scx(rq, rq->curr, NULL); +#endif + + p = first_local_task(rq); + if (!p) + return NULL; + + if (unlikely(!p->scx.slice)) { + if (!scx_ops_disabling() && !scx_warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", + p->comm, p->pid); + scx_warned_zero_slice = true; + } + p->scx.slice = SCX_SLICE_DFL; + } + + set_next_task_scx(rq, p, true); + + return p; +} + +#ifdef CONFIG_SMP + +static bool test_and_clear_cpu_idle(int cpu) +{ + if (cpumask_test_and_clear_cpu(cpu, idle_masks.cpu)) { + if (cpumask_empty(idle_masks.cpu)) + scx_has_idle_cpus = false; + return true; + } else { + return false; + } +} + +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed) +{ + int cpu; + + do { + cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); + if (cpu < nr_cpu_ids) { + const struct cpumask *sbm = topology_sibling_cpumask(cpu); + + /* + * If offline, @cpu is not its own sibling and we can + * get caught in an infinite loop as @cpu is never + * cleared from idle_masks.smt. Clear @cpu directly in + * such cases. 
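+ * (A stale idle bit for an offline CPU can exist in the first place
+ * because, e.g., reset_idle_masks() marks every possible CPU idle
+ * regardless of its online state.)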
+ */ + if (likely(cpumask_test_cpu(cpu, sbm))) + cpumask_andnot(idle_masks.smt, idle_masks.smt, sbm); + else + cpumask_andnot(idle_masks.smt, idle_masks.smt, cpumask_of(cpu)); + } else { + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + } + } while (!test_and_clear_cpu_idle(cpu)); + + return cpu; +} + +static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return prev_cpu; + } + + /* + * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the + * local DSQ of the waker. + */ + if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + scx_has_idle_cpus && !(current->flags & PF_EXITING)) { + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, p->cpus_ptr)) { + p->scx.flags |= SCX_TASK_ENQ_LOCAL; + return cpu; + } + } + + /* if the previous CPU is idle, dispatch directly to it */ + if (test_and_clear_cpu_idle(prev_cpu)) { + p->scx.flags |= SCX_TASK_ENQ_LOCAL; + return prev_cpu; + } + + if (p->nr_cpus_allowed == 1) + return prev_cpu; + + cpu = scx_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) { + p->scx.flags |= SCX_TASK_ENQ_LOCAL; + return cpu; + } + + return prev_cpu; +} + +static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) +{ + if (SCX_HAS_OP(select_cpu)) { + s32 cpu; + + cpu = SCX_CALL_OP_RET(SCX_KF_REST, select_cpu, p, prev_cpu, + wake_flags); + if (ops_cpu_valid(cpu)) { + return cpu; + } else { + scx_ops_error("select_cpu returned invalid cpu %d", cpu); + return prev_cpu; + } + } else { + return scx_select_cpu_dfl(p, prev_cpu, wake_flags); + } +} + +static void set_cpus_allowed_scx(struct task_struct *p, + struct affinity_context *ac) +{ + set_cpus_allowed_common(p, ac); + + /* + * The effective cpumask is stored in @p->cpus_ptr which may temporarily + * differ from the configured one in @p->cpus_mask. Always tell the bpf + * scheduler the effective one. + * + * Fine-grained memory write control is enforced by BPF making the const + * designation pointless. Cast it away when calling the operation. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void reset_idle_masks(void) +{ + /* consider all cpus idle, should converge to the actual state quickly */ + cpumask_setall(idle_masks.cpu); + cpumask_setall(idle_masks.smt); + scx_has_idle_cpus = true; +} + +void __scx_update_idle(struct rq *rq, bool idle) +{ + int cpu = cpu_of(rq); + struct cpumask *sib_mask = topology_sibling_cpumask(cpu); + + if (SCX_HAS_OP(update_idle)) { + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + if (!static_branch_unlikely(&scx_builtin_idle_enabled)) + return; + } + + if (idle) { + cpumask_set_cpu(cpu, idle_masks.cpu); + if (!scx_has_idle_cpus) + scx_has_idle_cpus = true; + + /* + * idle_masks.smt handling is racy but that's fine as it's only + * for optimization and self-correcting. 
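+ *
+ * A sibling can change state between the loop below and the
+ * cpumask_or(), leaving a stale bit in idle_masks.smt. The worst case
+ * is scx_pick_idle_cpu() picking a busy CPU off the smt mask and then
+ * failing test_and_clear_cpu_idle(), in which case it loops and
+ * retries with the stale bits already cleared.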
+ */ + for_each_cpu(cpu, sib_mask) { + if (!cpumask_test_cpu(cpu, idle_masks.cpu)) + return; + } + cpumask_or(idle_masks.smt, idle_masks.smt, sib_mask); + } else { + cpumask_clear_cpu(cpu, idle_masks.cpu); + if (scx_has_idle_cpus && cpumask_empty(idle_masks.cpu)) + scx_has_idle_cpus = false; + + cpumask_andnot(idle_masks.smt, idle_masks.smt, sib_mask); + } +} + +#else /* !CONFIG_SMP */ + +static bool test_and_clear_cpu_idle(int cpu) { return false; } +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed) { return -EBUSY; } +static void reset_idle_masks(void) {} + +#endif /* CONFIG_SMP */ + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ + update_curr_scx(rq); + + /* + * While disabling, always resched as we can't trust the slice + * management. + */ + if (scx_ops_disabling()) + curr->scx.slice = 0; + + if (!curr->scx.slice) + resched_curr(rq); +} + +static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) +{ + int ret; + + WARN_ON_ONCE(p->scx.flags & SCX_TASK_OPS_PREPPED); + + if (SCX_HAS_OP(prep_enable)) { + struct scx_enable_args args = { }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, prep_enable, p, &args); + if (unlikely(ret)) { + ret = ops_sanitize_err("prep_enable", ret); + return ret; + } + } + + p->scx.flags |= SCX_TASK_OPS_PREPPED; + return 0; +} + +static void scx_ops_enable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); + + if (SCX_HAS_OP(enable)) { + struct scx_enable_args args = { }; + SCX_CALL_OP(SCX_KF_REST, enable, p, &args); + } + p->scx.flags &= ~SCX_TASK_OPS_PREPPED; + p->scx.flags |= SCX_TASK_OPS_ENABLED; +} + +static void scx_ops_disable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + + if (p->scx.flags & SCX_TASK_OPS_PREPPED) { + if (SCX_HAS_OP(cancel_enable)) { + struct scx_enable_args args = { }; + SCX_CALL_OP(SCX_KF_REST, cancel_enable, p, &args); + } + p->scx.flags &= ~SCX_TASK_OPS_PREPPED; + } else if (p->scx.flags & SCX_TASK_OPS_ENABLED) { + if (SCX_HAS_OP(disable)) + SCX_CALL_OP(SCX_KF_REST, disable, p); + p->scx.flags &= ~SCX_TASK_OPS_ENABLED; + } +} + +/** + * refresh_scx_weight - Refresh a task's ext weight + * @p: task to refresh ext weight for + * + * @p->scx.weight carries the task's static priority in cgroup weight scale to + * enable easy access from the BPF scheduler. To keep it synchronized with the + * current task priority, this function should be called when a new task is + * created, priority is changed for a task on sched_ext, and a task is switched + * to sched_ext from other classes. + */ +static void refresh_scx_weight(struct task_struct *p) +{ + u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +void scx_pre_fork(struct task_struct *p) +{ + /* + * BPF scheduler enable/disable paths want to be able to iterate and + * update all tasks which can become complex when racing forks. As + * enable/disable are very cold paths, let's use a percpu_rwsem to + * exclude forks. 
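+ *
+ * The read section spans from scx_pre_fork() to scx_post_fork() or
+ * scx_cancel_fork(), whichever finishes the fork attempt, while
+ * scx_ops_enable() and scx_ops_disable_workfn() take the rwsem for
+ * writing when switching tasks between sched_ext and other classes.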
+ */ + percpu_down_read(&scx_fork_rwsem); +} + +int scx_fork(struct task_struct *p) +{ + percpu_rwsem_assert_held(&scx_fork_rwsem); + + if (scx_enabled()) + return scx_ops_prepare_task(p, task_group(p)); + else + return 0; +} + +void scx_post_fork(struct task_struct *p) +{ + refresh_scx_weight(p); + + if (scx_enabled()) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_enable_task(p); + task_rq_unlock(rq, p, &rf); + } + + spin_lock_irq(&scx_tasks_lock); + list_add_tail(&p->scx.tasks_node, &scx_tasks); + spin_unlock_irq(&scx_tasks_lock); + + percpu_up_read(&scx_fork_rwsem); +} + +void scx_cancel_fork(struct task_struct *p) +{ + if (scx_enabled()) + scx_ops_disable_task(p); + percpu_up_read(&scx_fork_rwsem); +} + +void sched_ext_free(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&scx_tasks_lock, flags); + list_del_init(&p->scx.tasks_node); + spin_unlock_irqrestore(&scx_tasks_lock, flags); + + /* + * @p is off scx_tasks and wholly ours. scx_ops_enable()'s PREPPED -> + * ENABLED transitions can't race us. Disable ops for @p. + */ + if (p->scx.flags & (SCX_TASK_OPS_PREPPED | SCX_TASK_OPS_ENABLED)) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_disable_task(p); + task_rq_unlock(rq, p, &rf); + } +} + +static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) +{ + refresh_scx_weight(p); +} + +static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +{ +} + +static void switching_to_scx(struct rq *rq, struct task_struct *p) +{ + refresh_scx_weight(p); + + /* + * set_cpus_allowed_scx() is not called while @p is associated with a + * different scheduler class. Keep the BPF scheduler up-to-date. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void switched_to_scx(struct rq *rq, struct task_struct *p) {} + +/* + * Omitted operations: + * + * - check_preempt_curr: NOOP as it isn't useful in the wakeup path because the + * task isn't tied to the CPU at that point. + * + * - migrate_task_rq: Unncessary as task to cpu mapping is transient. + * + * - task_fork/dead: We need fork/dead notifications for all tasks regardless of + * their current sched_class. Call them directly from sched core instead. + * + * - task_woken, switched_from: Unnecessary. 
+ */ +DEFINE_SCHED_CLASS(ext) = { + .enqueue_task = enqueue_task_scx, + .dequeue_task = dequeue_task_scx, + .yield_task = yield_task_scx, + .yield_to_task = yield_to_task_scx, + + .check_preempt_curr = check_preempt_curr_scx, + + .pick_next_task = pick_next_task_scx, + + .put_prev_task = put_prev_task_scx, + .set_next_task = set_next_task_scx, + +#ifdef CONFIG_SMP + .balance = balance_scx, + .select_task_rq = select_task_rq_scx, + .set_cpus_allowed = set_cpus_allowed_scx, +#endif + + .task_tick = task_tick_scx, + + .switching_to = switching_to_scx, + .switched_to = switched_to_scx, + .reweight_task = reweight_task_scx, + .prio_changed = prio_changed_scx, + + .update_curr = update_curr_scx, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 0, +#endif +}; + +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +{ + memset(dsq, 0, sizeof(*dsq)); + + raw_spin_lock_init(&dsq->lock); + INIT_LIST_HEAD(&dsq->fifo); + dsq->id = dsq_id; +} + +static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) +{ + struct scx_dispatch_q *dsq; + int ret; + + if (dsq_id & SCX_DSQ_FLAG_BUILTIN) + return ERR_PTR(-EINVAL); + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return ERR_PTR(-ENOMEM); + + init_dsq(dsq, dsq_id); + + ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); + if (ret) { + kfree(dsq); + return ERR_PTR(ret); + } + return dsq; +} + +static void free_dsq_irq_workfn(struct irq_work *irq_work) +{ + struct llist_node *to_free = llist_del_all(&dsqs_to_free); + struct scx_dispatch_q *dsq, *tmp_dsq; + + llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) + kfree_rcu(dsq); +} + +static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); + +static void destroy_dsq(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + unsigned long flags; + + rcu_read_lock(); + + dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + if (!dsq) + goto out_unlock_rcu; + + raw_spin_lock_irqsave(&dsq->lock, flags); + + if (dsq->nr) { + scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); + goto out_unlock_dsq; + } + + if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + goto out_unlock_dsq; + + /* + * Mark dead by invalidating ->id to prevent dispatch_enqueue() from + * queueing more tasks. As this function can be called from anywhere, + * freeing is bounced through an irq work to avoid nesting RCU + * operations inside scheduler locks. + */ + dsq->id = SCX_DSQ_INVALID; + llist_add(&dsq->free_node, &dsqs_to_free); + irq_work_queue(&free_dsq_irq_work); + +out_unlock_dsq: + raw_spin_unlock_irqrestore(&dsq->lock, flags); +out_unlock_rcu: + rcu_read_unlock(); +} + +/* + * Used by sched_fork() and __setscheduler_prio() to pick the matching + * sched_class. dl/rt are already handled. 
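+ *
+ * For __setscheduler_prio(), the resulting fair vs. ext choice is roughly:
+ *
+ *	if (task_on_scx(p))
+ *		p->sched_class = &ext_sched_class;
+ *	else
+ *		p->sched_class = &fair_sched_class;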
+ */ +bool task_on_scx(struct task_struct *p) +{ + if (!scx_enabled() || scx_ops_disabling()) + return false; + return p->policy == SCHED_EXT; +} + +static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) +{ + if (enq_flags & SCX_ENQ_LAST) + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + else + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} + +static void scx_ops_disable_workfn(struct kthread_work *work) +{ + struct scx_exit_info *ei = &scx_exit_info; + struct scx_task_iter sti; + struct task_struct *p; + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + const char *reason; + int i, type; + + type = atomic_read(&scx_exit_type); + while (true) { + /* + * NONE indicates that a new scx_ops has been registered since + * disable was scheduled - don't kill the new ops. DONE + * indicates that the ops has already been disabled. + */ + if (type == SCX_EXIT_NONE || type == SCX_EXIT_DONE) + return; + if (atomic_try_cmpxchg(&scx_exit_type, &type, SCX_EXIT_DONE)) + break; + } + + switch (type) { + case SCX_EXIT_UNREG: + reason = "BPF scheduler unregistered"; + break; + case SCX_EXIT_ERROR: + reason = "runtime error"; + break; + case SCX_EXIT_ERROR_BPF: + reason = "scx_bpf_error"; + break; + default: + reason = ""; + } + + ei->type = type; + strlcpy(ei->reason, reason, sizeof(ei->reason)); + + switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { + case SCX_OPS_DISABLED: + pr_warn("sched_ext: ops error detected without ops (%s)\n", + scx_exit_info.msg); + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + return; + case SCX_OPS_PREPPING: + goto forward_progress_guaranteed; + case SCX_OPS_DISABLING: + /* shouldn't happen but handle it like ENABLING if it does */ + WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); + fallthrough; + case SCX_OPS_ENABLING: + case SCX_OPS_ENABLED: + break; + } + + /* + * DISABLING is set and ops was either ENABLING or ENABLED indicating + * that the ops and static branches are set. + * + * We must guarantee that all runnable tasks make forward progress + * without trusting the BPF scheduler. We can't grab any mutexes or + * rwsems as they might be held by tasks that the BPF scheduler is + * forgetting to run, which unfortunately also excludes toggling the + * static branches. + * + * Let's work around by overriding a couple ops and modifying behaviors + * based on the DISABLING state and then cycling the tasks through + * dequeue/enqueue to force global FIFO scheduling. + * + * a. ops.enqueue() and .dispatch() are overridden for simple global + * FIFO scheduling. + * + * b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value + * can't be trusted. Whenever a tick triggers, the running task is + * rotated to the tail of the queue. + * + * c. pick_next_task() suppresses zero slice warning. 
+ */ + scx_ops.enqueue = scx_ops_fallback_enqueue; + scx_ops.dispatch = scx_ops_fallback_dispatch; + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + if (READ_ONCE(p->__state) != TASK_DEAD) { + SCHED_CHANGE_BLOCK(task_rq(p), p, + DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* cycling deq/enq is enough, see above */ + } + } + } + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + +forward_progress_guaranteed: + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ + mutex_lock(&scx_ops_enable_mutex); + + /* avoid racing against fork */ + cpus_read_lock(); + percpu_down_write(&scx_fork_rwsem); + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct rq *rq = task_rq(p); + bool alive = READ_ONCE(p->__state) != TASK_DEAD; + + update_rq_clock(rq); + + SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_MOVE | + DEQUEUE_NOCLOCK) { + p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); + + __setscheduler_prio(p, p->prio); + if (alive) + check_class_changing(task_rq(p), p, old_class); + } + + if (alive) + check_class_changed(task_rq(p), p, old_class, p->prio); + + scx_ops_disable_task(p); + } + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + + /* no task is on scx, turn off all the switches and flush in-progress calls */ + static_branch_disable_cpuslocked(&__scx_ops_enabled); + for (i = 0; i < SCX_NR_ONLINE_OPS; i++) + static_branch_disable_cpuslocked(&scx_has_op[i]); + static_branch_disable_cpuslocked(&scx_ops_enq_last); + static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + synchronize_rcu(); + + percpu_up_write(&scx_fork_rwsem); + cpus_read_unlock(); + + if (ei->type >= SCX_EXIT_ERROR) { + printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); + + if (ei->msg[0] == '\0') + printk(KERN_ERR "sched_ext: %s\n", ei->reason); + else + printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); + + stack_trace_print(ei->bt, ei->bt_len, 2); + } + + if (scx_ops.exit) + SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + + memset(&scx_ops, 0, sizeof(scx_ops)); + + rhashtable_walk_enter(&dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + free_percpu(scx_dsp_buf); + scx_dsp_buf = NULL; + scx_dsp_max_batch = 0; + + mutex_unlock(&scx_ops_enable_mutex); + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); +} + +static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); + +static void schedule_scx_ops_disable_work(void) +{ + struct kthread_worker *helper = READ_ONCE(scx_ops_helper); + + /* + * We may be called spuriously before the first bpf_sched_ext_reg(). If + * scx_ops_helper isn't set up yet, there's nothing to do. 
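+ *
+ * scx_ops_helper is created early in scx_ops_enable() and never
+ * destroyed, so a NULL here means that no enable attempt has gotten far
+ * enough to require disabling.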
+ */ + if (helper) + kthread_queue_work(helper, &scx_ops_disable_work); +} + +static void scx_ops_disable(enum scx_exit_type type) +{ + int none = SCX_EXIT_NONE; + + if (WARN_ON_ONCE(type == SCX_EXIT_NONE || type == SCX_EXIT_DONE)) + type = SCX_EXIT_ERROR; + + atomic_try_cmpxchg(&scx_exit_type, &none, type); + + schedule_scx_ops_disable_work(); +} + +static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +{ + schedule_scx_ops_disable_work(); +} + +static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); + +__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type, + const char *fmt, ...) +{ + struct scx_exit_info *ei = &scx_exit_info; + int none = SCX_EXIT_NONE; + va_list args; + + if (!atomic_try_cmpxchg(&scx_exit_type, &none, type)) + return; + + ei->bt_len = stack_trace_save(ei->bt, ARRAY_SIZE(ei->bt), 1); + + va_start(args, fmt); + vscnprintf(ei->msg, ARRAY_SIZE(ei->msg), fmt, args); + va_end(args); + + irq_work_queue(&scx_ops_error_irq_work); +} + +static struct kthread_worker *scx_create_rt_helper(const char *name) +{ + struct kthread_worker *helper; + + helper = kthread_create_worker(0, name); + if (helper) + sched_set_fifo(helper->task); + return helper; +} + +static int scx_ops_enable(struct sched_ext_ops *ops) +{ + struct scx_task_iter sti; + struct task_struct *p; + int i, ret; + + mutex_lock(&scx_ops_enable_mutex); + + if (!scx_ops_helper) { + WRITE_ONCE(scx_ops_helper, + scx_create_rt_helper("sched_ext_ops_helper")); + if (!scx_ops_helper) { + ret = -ENOMEM; + goto err_unlock; + } + } + + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + ret = -EBUSY; + goto err_unlock; + } + + /* + * Set scx_ops, transition to PREPPING and clear exit info to arm the + * disable path. Failure triggers full disabling from here on. + */ + scx_ops = *ops; + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + SCX_OPS_DISABLED); + + memset(&scx_exit_info, 0, sizeof(scx_exit_info)); + atomic_set(&scx_exit_type, SCX_EXIT_NONE); + scx_warned_zero_slice = false; + + atomic64_set(&scx_nr_rejected, 0); + + /* + * Keep CPUs stable during enable so that the BPF scheduler can track + * online CPUs by watching ->on/offline_cpu() after ->init(). + */ + cpus_read_lock(); + + if (scx_ops.init) { + ret = SCX_CALL_OP_RET(SCX_KF_INIT, init); + if (ret) { + ret = ops_sanitize_err("init", ret); + goto err_disable; + } + + /* + * Exit early if ops.init() triggered scx_bpf_error(). Not + * strictly necessary as we'll fail transitioning into ENABLING + * later but that'd be after calling ops.prep_enable() on all + * tasks and with -EBUSY which isn't very intuitive. Let's exit + * early with success so that the condition is notified through + * ops.exit() like other scx_bpf_error() invocations. + */ + if (atomic_read(&scx_exit_type) != SCX_EXIT_NONE) + goto err_disable; + } + + WARN_ON_ONCE(scx_dsp_buf); + scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch, + __alignof__(scx_dsp_buf[0])); + if (!scx_dsp_buf) { + ret = -ENOMEM; + goto err_disable; + } + + /* + * Lock out forks before opening the floodgate so that they don't wander + * into the operations prematurely. 
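+ *
+ * The resulting locking order on the enable path is:
+ *
+ *	scx_ops_enable_mutex -> cpus_read_lock() ->
+ *	scx_fork_rwsem (write) -> scx_tasks_lock
+ *
+ * which matches the order used by scx_ops_disable_workfn().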
+ */ + percpu_down_write(&scx_fork_rwsem); + + for (i = 0; i < SCX_NR_ONLINE_OPS; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + if (ops->flags & SCX_OPS_ENQ_LAST) + static_branch_enable_cpuslocked(&scx_ops_enq_last); + + if (ops->flags & SCX_OPS_ENQ_EXITING) + static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + reset_idle_masks(); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + } else { + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + } + + static_branch_enable_cpuslocked(&__scx_ops_enabled); + + /* + * Enable ops for every task. Fork is excluded by scx_fork_rwsem + * preventing new tasks from being added. No need to exclude tasks + * leaving as sched_ext_free() can handle both prepped and enabled + * tasks. Prep all tasks first and then enable them with preemption + * disabled. + */ + spin_lock_irq(&scx_tasks_lock); + + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered(&sti))) { + get_task_struct(p); + spin_unlock_irq(&scx_tasks_lock); + + ret = scx_ops_prepare_task(p, task_group(p)); + if (ret) { + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + pr_err("sched_ext: ops.prep_enable() failed (%d) for %s[%d] while loading\n", + ret, p->comm, p->pid); + goto err_disable_unlock; + } + + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + } + scx_task_iter_exit(&sti); + + /* + * All tasks are prepped but are still ops-disabled. Ensure that + * %current can't be scheduled out and switch everyone. + * preempt_disable() is necessary because we can't guarantee that + * %current won't be starved if scheduled out while switching. + */ + preempt_disable(); + + /* + * From here on, the disable path must assume that tasks have ops + * enabled and need to be recovered. + */ + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { + preempt_enable(); + spin_unlock_irq(&scx_tasks_lock); + ret = -EBUSY; + goto err_disable_unlock; + } + + /* + * We're fully committed and can't fail. The PREPPED -> ENABLED + * transitions here are synchronized against sched_ext_free() through + * scx_tasks_lock. 
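+ *
+ * Tasks already observed as TASK_DEAD are not switched; the loop below
+ * only cancels their prepped state through scx_ops_disable_task().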
+ */ + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + if (READ_ONCE(p->__state) != TASK_DEAD) { + const struct sched_class *old_class = p->sched_class; + struct rq *rq = task_rq(p); + + update_rq_clock(rq); + + SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_MOVE | + DEQUEUE_NOCLOCK) { + scx_ops_enable_task(p); + __setscheduler_prio(p, p->prio); + check_class_changing(task_rq(p), p, old_class); + } + + check_class_changed(task_rq(p), p, old_class, p->prio); + } else { + scx_ops_disable_task(p); + } + } + scx_task_iter_exit(&sti); + + spin_unlock_irq(&scx_tasks_lock); + preempt_enable(); + percpu_up_write(&scx_fork_rwsem); + + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { + ret = -EBUSY; + goto err_disable; + } + + cpus_read_unlock(); + mutex_unlock(&scx_ops_enable_mutex); + + return 0; + +err_unlock: + mutex_unlock(&scx_ops_enable_mutex); + return ret; + +err_disable_unlock: + percpu_up_write(&scx_fork_rwsem); +err_disable: + cpus_read_unlock(); + mutex_unlock(&scx_ops_enable_mutex); + /* must be fully disabled before returning */ + scx_ops_disable(SCX_EXIT_ERROR); + kthread_flush_work(&scx_ops_disable_work); + return ret; +} + +#ifdef CONFIG_SCHED_DEBUG +static const char *scx_ops_enable_state_str[] = { + [SCX_OPS_PREPPING] = "prepping", + [SCX_OPS_ENABLING] = "enabling", + [SCX_OPS_ENABLED] = "enabled", + [SCX_OPS_DISABLING] = "disabling", + [SCX_OPS_DISABLED] = "disabled", +}; + +static int scx_debug_show(struct seq_file *m, void *v) +{ + mutex_lock(&scx_ops_enable_mutex); + seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); + seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled()); + seq_printf(m, "%-30s: %s\n", "enable_state", + scx_ops_enable_state_str[scx_ops_enable_state()]); + seq_printf(m, "%-30s: %llu\n", "nr_rejected", + atomic64_read(&scx_nr_rejected)); + mutex_unlock(&scx_ops_enable_mutex); + return 0; +} + +static int scx_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, scx_debug_show, NULL); +} + +const struct file_operations sched_ext_fops = { + .open = scx_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +/******************************************************************************** + * bpf_struct_ops plumbing. 
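+ *
+ * bpf_scx_reg() and bpf_scx_unreg() below are invoked when a BPF
+ * scheduler's struct_ops map is attached and detached; they simply
+ * forward to scx_ops_enable() and scx_ops_disable() defined above.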
+ */ +#include +#include +#include + +extern struct btf *btf_vmlinux; +static const struct btf_type *task_struct_type; + +static bool bpf_scx_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t; + + t = btf_type_by_id(reg->btf, reg->btf_id); + if (t == task_struct_type) { + if (off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) + return SCALAR_VALUE; + } + + return 0; +} + +static const struct bpf_func_proto * +bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +const struct bpf_verifier_ops bpf_scx_verifier_ops = { + .get_func_proto = bpf_scx_get_func_proto, + .is_valid_access = bpf_scx_is_valid_access, + .btf_struct_access = bpf_scx_btf_struct_access, +}; + +static int bpf_scx_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct sched_ext_ops *uops = udata; + struct sched_ext_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + int ret; + + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch_max_batch): + if (*(u32 *)(udata + moff) > INT_MAX) + return -E2BIG; + ops->dispatch_max_batch = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, flags): + if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) + return -EINVAL; + ops->flags = *(u64 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, name): + ret = bpf_obj_name_cpy(ops->name, uops->name, + sizeof(ops->name)); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + return 1; + } + + return 0; +} + +static int bpf_scx_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct sched_ext_ops, prep_enable): + case offsetof(struct sched_ext_ops, init): + case offsetof(struct sched_ext_ops, exit): + break; + default: + if (prog->aux->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_scx_reg(void *kdata) +{ + return scx_ops_enable(kdata); +} + +static void bpf_scx_unreg(void *kdata) +{ + scx_ops_disable(SCX_EXIT_UNREG); + kthread_flush_work(&scx_ops_disable_work); +} + +static int bpf_scx_init(struct btf *btf) +{ + u32 type_id; + + type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + task_struct_type = btf_type_by_id(btf, type_id); + + return 0; +} + +/* "extern" to avoid sparse warning, only used in this file */ +extern struct bpf_struct_ops bpf_sched_ext_ops; + +struct bpf_struct_ops bpf_sched_ext_ops = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .name = "sched_ext_ops", +}; + 
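+/*
+ * For reference, this is what registration looks like from the BPF side. A
+ * minimal scheduler which simply feeds every task into the global FIFO could
+ * look roughly like the sketch below. BPF_STRUCT_OPS() stands in for the
+ * usual libbpf struct_ops boilerplate (the example schedulers added later in
+ * this series carry such a helper in scx_common.bpf.h):
+ *
+ *	void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
+ *	{
+ *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ *	}
+ *
+ *	SEC(".struct_ops")
+ *	struct sched_ext_ops minimal_ops = {
+ *		.enqueue	= (void *)minimal_enqueue,
+ *		.name		= "minimal",
+ *	};
+ *
+ * Attaching the struct_ops map invokes bpf_scx_reg() above which in turn
+ * calls scx_ops_enable(); detaching goes through bpf_scx_unreg() and
+ * scx_ops_disable().
+ */
+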
+void __init init_sched_ext_class(void) +{ + int cpu; + u32 v; + + /* + * The following is to prevent the compiler from optimizing out the enum + * definitions so that BPF scheduler implementations can use them + * through the generated vmlinux.h. + */ + WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP); + + BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); + init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); +#ifdef CONFIG_SMP + BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); +#endif + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + } +} + + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ +#include + +/* Disables missing prototype warnings for kfuncs */ +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +/** + * scx_bpf_create_dsq - Create a custom DSQ + * @dsq_id: DSQ to create + * @node: NUMA node to allocate from + * + * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and + * ops.prep_enable(). + */ +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) +{ + if (!scx_kf_allowed(SCX_KF_INIT | SCX_KF_SLEEPABLE)) + return -EINVAL; + + if (unlikely(node >= (int)nr_node_ids || + (node < 0 && node != NUMA_NO_NODE))) + return -EINVAL; + return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); +} + +BTF_SET8_START(scx_kfunc_ids_sleepable) +BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_SET8_END(scx_kfunc_ids_sleepable) + +static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_sleepable, +}; + +static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) +{ + if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + return false; + + lockdep_assert_irqs_disabled(); + + if (unlikely(!p)) { + scx_ops_error("called with NULL task"); + return false; + } + + if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { + scx_ops_error("invalid enq_flags 0x%llx", enq_flags); + return false; + } + + return true; +} + +static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) +{ + struct task_struct *ddsp_task; + int idx; + + ddsp_task = __this_cpu_read(direct_dispatch_task); + if (ddsp_task) { + direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + return; + } + + idx = __this_cpu_read(scx_dsp_ctx.buf_cursor); + if (unlikely(idx >= scx_dsp_max_batch)) { + scx_ops_error("dispatch buffer overflow"); + return; + } + + this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){ + .task = p, + .qseq = atomic64_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, + .dsq_id = dsq_id, + .enq_flags = enq_flags, + }; + __this_cpu_inc(scx_dsp_ctx.buf_cursor); +} + +/** + * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe + * to call this function spuriously. Can be called from ops.enqueue() and + * ops.dispatch(). + * + * When called from ops.enqueue(), it's for direct dispatch and @p must match + * the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be used to target the + * local DSQ of a CPU other than the enqueueing one. 
Use ops.select_cpu() to be + * on the target CPU in the first place. + * + * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id + * and this function can be called upto ops.dispatch_max_batch times to dispatch + * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the + * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * + * This function doesn't have any locking restrictions and may be called under + * BPF locks (in the future when BPF introduces more flexible locking). + * + * @p is allowed to run for @slice. The scheduling path is triggered on slice + * exhaustion. If zero, the current residual slice is maintained. If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. + */ +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + scx_dispatch_commit(p, dsq_id, enq_flags); +} + +BTF_SET8_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_SET8_END(scx_kfunc_ids_enqueue_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + +/** + * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * + * Can only be called from ops.dispatch(). + */ +u32 scx_bpf_dispatch_nr_slots(void) +{ + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return 0; + + return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx.buf_cursor); +} + +/** + * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to consume + * + * Consume a task from the non-local DSQ identified by @dsq_id and transfer it + * to the current CPU's local DSQ for execution. Can only be called from + * ops.dispatch(). + * + * This function flushes the in-flight dispatches from scx_bpf_dispatch() before + * trying to consume the specified DSQ. It may also grab rq locks and thus can't + * be called under any BPF locks. + * + * Returns %true if a task has been consumed, %false if there isn't any task to + * consume. + */ +bool scx_bpf_consume(u64 dsq_id) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + struct scx_dispatch_q *dsq; + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + flush_dispatch_buf(dspc->rq, dspc->rf); + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + return false; + } + + if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { + /* + * A successfully consumed task can be dequeued before it starts + * running while the CPU is trying to migrate other dispatched + * tasks. Bump nr_tasks to tell balance_scx() to retry on empty + * local DSQ. + */ + dspc->nr_tasks++; + return true; + } else { + return false; + } +} + +BTF_SET8_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_SET8_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_dispatch, +}; + +/** + * scx_bpf_dsq_nr_queued - Return the number of queued tasks + * @dsq_id: id of the DSQ + * + * Return the number of tasks in the DSQ matching @dsq_id. If not found, + * -%ENOENT is returned. 
Can be called from any non-sleepable online scx_ops + * operations. + */ +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_LOCAL) { + return this_rq()->scx.local_dsq.nr; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (ops_cpu_valid(cpu)) + return cpu_rq(cpu)->scx.local_dsq.nr; + } else { + dsq = find_non_local_dsq(dsq_id); + if (dsq) + return dsq->nr; + } + return -ENOENT; +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return false; + } + + if (ops_cpu_valid(cpu)) + return test_and_clear_cpu_idle(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * + * Pick and claim an idle cpu which is also in @cpus_allowed. Returns the picked + * idle cpu number on success. -%EBUSY if no matching cpu was found. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return -EBUSY; + } + + return scx_pick_idle_cpu(cpus_allowed); +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return cpu_none_mask; + } + +#ifdef CONFIG_SMP + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return cpu_none_mask; + } + +#ifdef CONFIG_SMP + return idle_masks.smt; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + */ +void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or + * releasing a reference to a global idle cpumask, which is read-only + * in the caller and is never released. The acquire / release semantics + * here are just used to make the cpumask is a trusted pointer in the + * caller. 
+ */ +} + +struct scx_bpf_error_bstr_bufs { + u64 data[MAX_BPRINTF_VARARGS]; + char msg[SCX_EXIT_MSG_LEN]; +}; + +static DEFINE_PER_CPU(struct scx_bpf_error_bstr_bufs, scx_bpf_error_bstr_bufs); + +/** + * scx_bpf_error_bstr - Indicate fatal error + * @fmt: error message format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * Indicate that the BPF scheduler encountered a fatal error and initiate ops + * disabling. + */ +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) +{ + struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; + struct scx_bpf_error_bstr_bufs *bufs; + unsigned long flags; + int ret; + + local_irq_save(flags); + bufs = this_cpu_ptr(&scx_bpf_error_bstr_bufs); + + if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || + (data__sz && !data)) { + scx_ops_error("invalid data=%p and data__sz=%u", + (void *)data, data__sz); + goto out_restore; + } + + ret = copy_from_kernel_nofault(bufs->data, data, data__sz); + if (ret) { + scx_ops_error("failed to read data fields (%d)", ret); + goto out_restore; + } + + ret = bpf_bprintf_prepare(fmt, UINT_MAX, bufs->data, data__sz / 8, + &bprintf_data); + if (ret < 0) { + scx_ops_error("failed to format prepration (%d)", ret); + goto out_restore; + } + + ret = bstr_printf(bufs->msg, sizeof(bufs->msg), fmt, + bprintf_data.bin_args); + bpf_bprintf_cleanup(&bprintf_data); + if (ret < 0) { + scx_ops_error("scx_ops_error(\"%s\", %p, %u) failed to format", + fmt, data, data__sz); + goto out_restore; + } + + scx_ops_error_type(SCX_EXIT_ERROR_BPF, "%s", bufs->msg); +out_restore: + local_irq_restore(flags); +} + +/** + * scx_bpf_destroy_dsq - Destroy a custom DSQ + * @dsq_id: DSQ to destroy + * + * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with + * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is + * empty and no further tasks are dispatched to it. Ignored if called on a DSQ + * which doesn't exist. Can be called from any online scx_ops operations. + */ +void scx_bpf_destroy_dsq(u64 dsq_id) +{ + destroy_dsq(dsq_id); +} + +/** + * scx_bpf_task_running - Is task currently running? + * @p: task of interest + */ +bool scx_bpf_task_running(const struct task_struct *p) +{ + return task_rq(p)->curr == p; +} + +/** + * scx_bpf_task_cpu - CPU a task is currently associated with + * @p: task of interest + */ +s32 scx_bpf_task_cpu(const struct task_struct *p) +{ + return task_cpu(p); +} + +BTF_SET8_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_SET8_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_any, +}; + +__diag_pop(); + +/* + * This can't be done from init_sched_ext_class() as register_btf_kfunc_id_set() + * needs most of the system to be up. + */ +static int __init register_ext_kfuncs(void) +{ + int ret; + + /* + * Some kfuncs are context-sensitive and can only be called from + * specific SCX ops. 
They are grouped into BTF sets accordingly. + * Unfortunately, BPF currently doesn't have a way of enforcing such + * restrictions. Eventually, the verifier should be able to enforce + * them. For now, register them the same and make each kfunc explicitly + * check using scx_kf_allowed(). + */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_sleepable)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any))) { + pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret); + return ret; + } + + return 0; +} +__initcall(register_ext_kfuncs); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 6a93c48253399..f8d5682deacfd 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -1,7 +1,94 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_EXEC = WF_EXEC, + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + + /* high 32bits are SCX specific */ + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with + * %SCX_ENQ_LAST and %SCX_ENQ_LOCAL flags set. + * + * If the BPF scheduler wants to continue executing the task, + * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. + * If the task gets queued on a different dsq or the BPF side, the BPF + * scheduler is responsible for triggering a follow-up scheduling event. + * Otherwise, Execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* + * A hint indicating that it's advisable to enqueue the task on the + * local dsq of the currently selected CPU. Currently used by + * select_cpu_dfl() and together with %SCX_ENQ_LAST. 
+ */ + SCX_ENQ_LOCAL = 1LLU << 42, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, +}; #ifdef CONFIG_SCHED_CLASS_EXT -#error "NOT IMPLEMENTED YET" + +extern const struct sched_class ext_sched_class; +extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; +extern const struct file_operations sched_ext_fops; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) + +bool task_on_scx(struct task_struct *p); +void scx_pre_fork(struct task_struct *p); +int scx_fork(struct task_struct *p); +void scx_post_fork(struct task_struct *p); +void scx_cancel_fork(struct task_struct *p); +void init_sched_ext_class(void); + +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; + return class; +} + +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) + +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +/* + * SCX requires a balance() call before every pick_next_task() call including + * when waking up from idle. + */ +#define for_balance_class_range(class, prev_class, end_class) \ + for_active_class_range(class, (prev_class) > &ext_sched_class ? \ + &ext_sched_class : (prev_class), (end_class)) + #else /* CONFIG_SCHED_CLASS_EXT */ #define scx_enabled() false @@ -18,7 +105,13 @@ static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -#error "NOT IMPLEMENTED YET" +void __scx_update_idle(struct rq *rq, bool idle); + +static inline void scx_update_idle(struct rq *rq, bool idle) +{ + if (scx_enabled()) + __scx_update_idle(rq, idle); +} #else static inline void scx_update_idle(struct rq *rq, bool idle) {} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6c42b042daa45..ae4cd306bf287 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -185,6 +185,10 @@ static inline int idle_policy(int policy) static inline int normal_policy(int policy) { +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif return policy == SCHED_NORMAL; } @@ -681,6 +685,15 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +struct scx_rq { + struct scx_dispatch_q local_dsq; + u64 ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -1022,6 +1035,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ From 247c051f997e32179770a4030026adf1902b95b2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:06 -1000 Subject: [PATCH 014/304] sched_ext: Add scx_example_simple and scx_example_qmap example schedulers Add two simple example BPF schedulers - simple and qmap. * simple: In terms of scheduling, it behaves identical to not having any operation implemented at all. The two operations it implements are only to improve visibility and exit handling. 
On certain homogeneous configurations, this actually can perform pretty well. * qmap: A fixed five level priority scheduler to demonstrate queueing PIDs on BPF maps for scheduling. While not very practical, this is useful as a simple example and will be used to demonstrate different features. v3: * Rename scx_example_dummy to scx_example_simple and restructure a bit to ease later additions. Comment updates. * Added declarations for BPF inline iterators. In the future, hopefully, these will be consolidated into a generic BPF header so that they don't need to be replicated here. v2: * Updated with the generic BPF cpumask helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- tools/sched_ext/.gitignore | 5 + tools/sched_ext/Makefile | 188 +++++++++++++++ tools/sched_ext/gnu/stubs.h | 1 + tools/sched_ext/scx_common.bpf.h | 283 +++++++++++++++++++++++ tools/sched_ext/scx_example_qmap.bpf.c | 241 +++++++++++++++++++ tools/sched_ext/scx_example_qmap.c | 84 +++++++ tools/sched_ext/scx_example_simple.bpf.c | 56 +++++ tools/sched_ext/scx_example_simple.c | 93 ++++++++ tools/sched_ext/user_exit_info.h | 50 ++++ 9 files changed, 1001 insertions(+) create mode 100644 tools/sched_ext/.gitignore create mode 100644 tools/sched_ext/Makefile create mode 100644 tools/sched_ext/gnu/stubs.h create mode 100644 tools/sched_ext/scx_common.bpf.h create mode 100644 tools/sched_ext/scx_example_qmap.bpf.c create mode 100644 tools/sched_ext/scx_example_qmap.c create mode 100644 tools/sched_ext/scx_example_simple.bpf.c create mode 100644 tools/sched_ext/scx_example_simple.c create mode 100644 tools/sched_ext/user_exit_info.h diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 0000000000000..2ad3d86caf797 --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,5 @@ +scx_example_simple +scx_example_qmap +*.skel.h +*.subskel.h +/tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 0000000000000..8f0f14bb59ff0 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) 
+LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +SCRATCH_DIR := $(CURDIR)/tools +BUILD_DIR := $(SCRATCH_DIR)/build +INCLUDE_DIR := $(SCRATCH_DIR)/include +BPFOBJ_DIR := $(BUILD_DIR)/libbpf +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(BUILD_DIR)/host +HOST_SCRATCH_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_SCRATCH_DIR)/include +else +HOST_BUILD_DIR := $(BUILD_DIR) +HOST_SCRATCH_DIR := $(SCRATCH_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h \ + | $(BPFOBJ) + $(call msg,CLNG-BPF,,$@) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +%.skel.h: %.bpf.o $(BPFTOOL) + $(call msg,GEN-SKEL,,$@) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $(@:.skel.h=.subskel.h) + +scx_example_simple: scx_example_simple.c scx_example_simple.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + +scx_example_qmap: scx_example_qmap.c scx_example_qmap.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + +clean: + rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) + rm -f *.o *.bpf.o *.skel.h *.subskel.h + rm -f scx_example_simple scx_example_qmap + +.PHONY: all clean + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/tools/sched_ext/gnu/stubs.h b/tools/sched_ext/gnu/stubs.h new file mode 100644 index 0000000000000..719225b166269 --- /dev/null +++ b/tools/sched_ext/gnu/stubs.h @@ -0,0 +1 @@ +/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h new file mode 100644 index 0000000000000..d6cb32980dfd4 --- /dev/null +++ b/tools/sched_ext/scx_common.bpf.h @@ -0,0 +1,283 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_BPF_H +#define __SCHED_EXT_COMMON_BPF_H + +#include "vmlinux.h" +#include +#include +#include +#include "user_exit_info.h" + +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_error_format_checker(const char *fmt, ...) {} + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Note that __param[] must have at least one + * element to keep the verifier happy. + */ +#define scx_bpf_error(fmt, args...) \ +({ \ + static char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + \ + ___scx_bpf_error_format_checker(fmt, ##args); \ +}) + +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. ->field, [idx0][idx1], ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and add that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to NULL instead. The caller + * must check for NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for NULL and dereference + * immediately. 
+ * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + */ +#define MEMBER_VPTR(base, member) (typeof(base member) *)({ \ + u64 __base = (u64)base; \ + u64 __addr = (u64)&(base member) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof(base member))); \ + __addr; \ +}) + +/* + * BPF core and other generic helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym; +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool 
bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + +/* BPF core iterators from tools/testing/selftests/bpf/progs/bpf_misc.h */ +struct bpf_iter_num; + +extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym; +extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym; +extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym; + +#ifndef bpf_for_each +/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for + * using BPF open-coded iterators without having to write mundane explicit + * low-level loop logic. Instead, it provides for()-like generic construct + * that can be used pretty naturally. E.g., for some hypothetical cgroup + * iterator, you'd write: + * + * struct cgroup *cg, *parent_cg = <...>; + * + * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) { + * bpf_printk("Child cgroup id = %d", cg->cgroup_id); + * if (cg->cgroup_id == 123) + * break; + * } + * + * I.e., it looks almost like high-level for each loop in other languages, + * supports continue/break, and is verifiable by BPF verifier. + * + * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to + * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int + * *`, not just `int`. So for integers bpf_for() is more convenient. + * + * Note: this macro relies on C99 feature of allowing to declare variables + * inside for() loop, bound to for() loop lifetime. It also utilizes GCC + * extension: __attribute__((cleanup())), supported by both GCC and + * Clang. + */ +#define bpf_for_each(type, cur, args...) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */, \ + cleanup(bpf_iter_##type##_destroy))), \ + /* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_##type##_new(&___it, ##args), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_##type##_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_##type##_destroy, (void *)0); \ + /* iteration and termination check */ \ + (((cur) = bpf_iter_##type##_next(&___it))); \ +) +#endif /* bpf_for_each */ + +#ifndef bpf_for +/* bpf_for(i, start, end) implements a for()-like looping construct that sets + * provided integer variable *i* to values starting from *start* through, + * but not including, *end*. It also proves to BPF verifier that *i* belongs + * to range [start, end), so this can be used for accessing arrays without + * extra checks. + * + * Note: *start* and *end* are assumed to be expressions with no side effects + * and whose values do not change throughout bpf_for() loop execution. They do + * not have to be statically known or constant, though. + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. 
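+ *
+ * A minimal usage sketch (the loop bound is arbitrary; @i must be an int
+ * declared by the caller):
+ *
+ *	int i;
+ *
+ *	bpf_for(i, 0, 5)
+ *		bpf_printk("i=%d", i);
+ *
+ * which runs the body with i = 0 through 4.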
+ */ +#define bpf_for(i, start, end) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, (start), (end)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + ({ \ + /* iteration step */ \ + int *___t = bpf_iter_num_next(&___it); \ + /* termination and bounds check */ \ + (___t && ((i) = *___t, (i) >= (start) && (i) < (end))); \ + }); \ +) +#endif /* bpf_for */ + +#ifndef bpf_repeat +/* bpf_repeat(N) performs N iterations without exposing iteration number + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_repeat(N) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, 0, (N)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + bpf_iter_num_next(&___it); \ + /* nothing here */ \ +) +#endif /* bpf_repeat */ + +#endif /* __SCHED_EXT_COMMON_BPF_H */ diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c new file mode 100644 index 0000000000000..60e260577a3a1 --- /dev/null +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. + * + * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets + * assigned to one depending on its compound weight. Each CPU round robins + * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from + * queue0, 2 from queue1, 4 from queue2 and so on. + * + * This scheduler demonstrates: + * + * - BPF-side queueing using PIDs. + * - Sleepable per-task storage allocation using ops.prep_enable(). + * + * This scheduler is primarily for demonstration and testing of sched_ext + * features and unlikely to be useful for actual workloads. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" +#include + +char _license[] SEC("license") = "GPL"; + +const volatile u64 slice_ns = SCX_SLICE_DFL; + +u32 test_error_cnt; + +struct user_exit_info uei; + +struct qmap { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, u32); +} queue0 SEC(".maps"), + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), + queue4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 5); + __type(key, int); + __array(values, struct qmap); +} queue_arr SEC(".maps") = { + .values = { + [0] = &queue0, + [1] = &queue1, + [2] = &queue2, + [3] = &queue3, + [4] = &queue4, + }, +}; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Per-cpu dispatch index and remaining count */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, u32); + __type(value, u64); +} dispatch_idx_cnt SEC(".maps"); + +/* Statistics */ +unsigned long nr_enqueued, nr_dispatched, nr_dequeued; + +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) + return cpu; + + return prev_cpu; +} + +static int weight_to_idx(u32 weight) +{ + /* Coarsely map the compound weight to a FIFO. */ + if (weight <= 25) + return 0; + else if (weight <= 50) + return 1; + else if (weight < 200) + return 2; + else if (weight < 400) + return 3; + else + return 4; +} + +void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + u32 pid = p->pid; + int idx = weight_to_idx(p->scx.weight); + void *ring; + + if (test_error_cnt && !--test_error_cnt) + scx_bpf_error("test triggering error"); + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* Is select_cpu() is telling us to enqueue locally? */ + if (tctx->force_local) { + tctx->force_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { + scx_bpf_error("failed to find ring %d", idx); + return; + } + + /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ + if (bpf_map_push_elem(ring, &pid, 0)) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_enqueued, 1); +} + +/* + * The BPF queue map doesn't support removal and sched_ext can handle spurious + * dispatches. qmap_dequeue() is only used to collect statistics. 
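+ * (I.e. a PID may linger in a FIFO after its task has been dequeued or has
+ * exited; when qmap_dispatch() later pops it, bpf_task_from_pid() simply
+ * fails for tasks that are gone and, per the note above, the core tolerates
+ * dispatching a task that is no longer queued.)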
+ */ +void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) +{ + __sync_fetch_and_add(&nr_dequeued, 1); +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +{ + u32 zero = 0, one = 1; + u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero); + u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one); + void *fifo; + s32 pid; + int i; + + if (!idx || !cnt) { + scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt); + return; + } + + for (i = 0; i < 5; i++) { + /* Advance the dispatch cursor and pick the fifo. */ + if (!*cnt) { + *idx = (*idx + 1) % 5; + *cnt = 1 << *idx; + } + (*cnt)--; + + fifo = bpf_map_lookup_elem(&queue_arr, idx); + if (!fifo) { + scx_bpf_error("failed to find ring %llu", *idx); + return; + } + + /* Dispatch or advance. */ + if (!bpf_map_pop_elem(fifo, &pid)) { + struct task_struct *p; + + p = bpf_task_from_pid(pid); + if (p) { + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + + *cnt = 0; + } +} + +s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops qmap_ops = { + .select_cpu = (void *)qmap_select_cpu, + .enqueue = (void *)qmap_enqueue, + .dequeue = (void *)qmap_dequeue, + .dispatch = (void *)qmap_dispatch, + .prep_enable = (void *)qmap_prep_enable, + .exit = (void *)qmap_exit, + .name = "qmap", +}; diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c new file mode 100644 index 0000000000000..56c85c9fa979d --- /dev/null +++ b/tools/sched_ext/scx_example_qmap.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_qmap.skel.h" + +const char help_fmt[] = +"A simple five-level FIFO queue sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-e COUNT]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_example_qmap *skel; + struct bpf_link *link; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_qmap__open(); + assert(skel); + + while ((opt = getopt(argc, argv, "s:e:tTd:h")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'e': + skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + assert(!scx_example_qmap__load(skel)); + + link = bpf_map__attach_struct_ops(skel->maps.qmap_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + long nr_enqueued = skel->bss->nr_enqueued; + long nr_dispatched = skel->bss->nr_dispatched; + + printf("enq=%lu, dsp=%lu, delta=%ld, deq=%lu\n", + nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, + skel->bss->nr_dequeued); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_qmap__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_example_simple.bpf.c new file mode 100644 index 0000000000000..74716d0dd08d3 --- /dev/null +++ b/tools/sched_ext/scx_example_simple.bpf.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple scheduler. + * + * A simple global FIFO scheduler. It also demonstrates the following niceties. + * + * - Statistics tracking how many tasks are queued to local and global dsq's. + * - Termination notification for userspace. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (enq_flags & SCX_ENQ_LOCAL) { + stat_inc(0); /* count local queueing */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + return; + } + + stat_inc(1); /* count global queueing */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops simple_ops = { + .enqueue = (void *)simple_enqueue, + .exit = (void *)simple_exit, + .name = "simple", +}; diff --git a/tools/sched_ext/scx_example_simple.c b/tools/sched_ext/scx_example_simple.c new file mode 100644 index 0000000000000..2f1ee40f7e5a1 --- /dev/null +++ b/tools/sched_ext/scx_example_simple.c @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_simple.skel.h" + +const char help_fmt[] = +"A simple sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s\n" +"\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void read_stats(struct scx_example_simple *skel, u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + u64 cnts[2][nr_cpus]; + u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_example_simple *skel; + struct bpf_link *link; + u32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_simple__open(); + assert(skel); + + while ((opt = getopt(argc, argv, "h")) != -1) { + switch (opt) { + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + assert(!scx_example_simple__load(skel)); + + link = bpf_map__attach_struct_ops(skel->maps.simple_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + u64 stats[2]; + + read_stats(skel, stats); + printf("local=%lu global=%lu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_simple__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/user_exit_info.h b/tools/sched_ext/user_exit_info.h new file mode 100644 index 0000000000000..e701ef0e0b86c --- /dev/null +++ b/tools/sched_ext/user_exit_info.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to 
communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +struct user_exit_info { + int type; + char reason[128]; + char msg[1024]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include + +static inline void uei_record(struct user_exit_info *uei, + const struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); + bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); + /* use __sync to force memory barrier */ + __sync_val_compare_and_swap(&uei->type, uei->type, ei->type); +} + +#else /* !__bpf__ */ + +static inline bool uei_exited(struct user_exit_info *uei) +{ + /* use __sync to force memory barrier */ + return __sync_val_compare_and_swap(&uei->type, -1, -1); +} + +static inline void uei_print(const struct user_exit_info *uei) +{ + fprintf(stderr, "EXIT: %s", uei->reason); + if (uei->msg[0] != '\0') + fprintf(stderr, " (%s)", uei->msg); + fputs("\n", stderr); +} + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ From 99e6b104e3aad63ed4203ad67251174c5f25fe60 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:06 -1000 Subject: [PATCH 015/304] sched_ext: Add sysrq-S which disables the BPF scheduler This enables the admin to abort the BPF scheduler and revert to CFS anytime. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- drivers/tty/sysrq.c | 1 + include/linux/sched/ext.h | 1 + kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 20 ++++++++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index b6e70c5cfa174..ddfcdb6aecd77 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -520,6 +520,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { NULL, /* P */ NULL, /* Q */ NULL, /* R */ + /* S: May be registered by sched_ext for resetting */ NULL, /* S */ NULL, /* T */ NULL, /* U */ diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 45bf24a23c610..4b6b9386e2f89 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -55,6 +55,7 @@ enum scx_exit_type { SCX_EXIT_DONE, SCX_EXIT_UNREG = 64, /* BPF unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 4c658b21f603c..005025f55beaa 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8e778d8ec59c9..781d2e4c9c5dc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1930,6 +1930,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) case SCX_EXIT_UNREG: reason = "BPF scheduler unregistered"; break; + case SCX_EXIT_SYSRQ: + reason = "disabled by sysrq-S"; + break; case SCX_EXIT_ERROR: reason = "runtime error"; break; @@ -2533,6 +2536,21 @@ struct bpf_struct_ops bpf_sched_ext_ops = { .name = "sched_ext_ops", }; +static void sysrq_handle_sched_ext_reset(int key) +{ + if (scx_ops_helper) + scx_ops_disable(SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF scheduler not yet used\n"); +} + +static const struct sysrq_key_op 
sysrq_sched_ext_reset_op = { + .handler = sysrq_handle_sched_ext_reset, + .help_msg = "reset-sched-ext(S)", + .action_msg = "Disable sched_ext and revert all tasks to CFS", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + void __init init_sched_ext_class(void) { int cpu; @@ -2556,6 +2574,8 @@ void __init init_sched_ext_class(void) init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); } + + register_sysrq_key('S', &sysrq_sched_ext_reset_op); } From f8a72218c89fb5443fea5e3e3830130811b2107d Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 13 Apr 2023 06:40:06 -1000 Subject: [PATCH 016/304] sched_ext: Implement runnable task stall watchdog The most common and critical way that a BPF scheduler can misbehave is by failing to run runnable tasks for too long. This patch implements a watchdog. * All tasks record when they become runnable. * A watchdog work periodically scans all runnable tasks. If any task has stayed runnable for too long, the BPF scheduler is aborted. * scheduler_tick() monitors whether the watchdog itself is stuck. If so, the BPF scheduler is aborted. Because the watchdog only scans the tasks which are currently runnable and usually very infrequently, the overhead should be negligible. scx_example_qmap is updated so that it can be told to stall user and/or kernel tasks. A detected task stall looks like the following: sched_ext: BPF scheduler "qmap" errored, disabling sched_ext: runnable task stall (dbus-daemon[953] failed to run for 6.478s) scx_check_timeout_workfn+0x10e/0x1b0 process_one_work+0x287/0x560 worker_thread+0x234/0x420 kthread+0xe9/0x100 ret_from_fork+0x1f/0x30 A detected watchdog stall: sched_ext: BPF scheduler "qmap" errored, disabling sched_ext: runnable task stall (watchdog failed to check in for 5.001s) scheduler_tick+0x2eb/0x340 update_process_times+0x7a/0x90 tick_sched_timer+0xd8/0x130 __hrtimer_run_queues+0x178/0x3b0 hrtimer_interrupt+0xfc/0x390 __sysvec_apic_timer_interrupt+0xb7/0x2b0 sysvec_apic_timer_interrupt+0x90/0xb0 asm_sysvec_apic_timer_interrupt+0x1b/0x20 default_idle+0x14/0x20 arch_cpu_idle+0xf/0x20 default_idle_call+0x50/0x90 do_idle+0xe8/0x240 cpu_startup_entry+0x1d/0x20 kernel_init+0x0/0x190 start_kernel+0x0/0x392 start_kernel+0x324/0x392 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x104/0x109 secondary_startup_64_no_verify+0xce/0xdb Note that this patch exposes scx_ops_error[_type]() in kernel/sched/ext.h to inline scx_notify_sched_tick(). v2: Julia Lawall noticed that the watchdog code was mixing msecs and jiffies. Fix by using jiffies for everything. 
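To make the jiffies-only bookkeeping concrete, here is a standalone userspace
sketch of the comparison the watchdog performs. HZ, the helper functions and
the numbers are simplified stand-ins chosen only for illustration, not the
kernel implementations:

    #include <stdio.h>
    #include <stdbool.h>

    #define HZ 250  /* assumed tick rate, illustration only */

    static unsigned long msecs_to_jiffies(unsigned int ms) { return (unsigned long)ms * HZ / 1000; }
    static unsigned int jiffies_to_msecs(unsigned long j) { return (unsigned int)(j * 1000 / HZ); }
    static bool time_after(unsigned long a, unsigned long b) { return (long)(b - a) < 0; }

    int main(void)
    {
            unsigned long jiffies = 100000;                         /* "now" in ticks */
            unsigned long timeout = msecs_to_jiffies(5000);         /* ops->timeout_ms */
            unsigned long runnable_at = jiffies - msecs_to_jiffies(6500);

            /* same shape as the check in check_rq_for_timeouts(): all math in jiffies */
            if (time_after(jiffies, runnable_at + timeout)) {
                    /* convert to msecs only when reporting */
                    unsigned int dur_ms = jiffies_to_msecs(jiffies - runnable_at);
                    printf("runnable task stall (%u.%03us)\n", dur_ms / 1000, dur_ms % 1000);
            }
            return 0;
    }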
Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Julia Lawall --- include/linux/sched/ext.h | 13 +++ init/init_task.c | 2 + kernel/sched/core.c | 3 + kernel/sched/ext.c | 128 +++++++++++++++++++++++-- kernel/sched/ext.h | 25 +++++ kernel/sched/sched.h | 1 + tools/sched_ext/scx_example_qmap.bpf.c | 12 +++ tools/sched_ext/scx_example_qmap.c | 12 ++- 8 files changed, 185 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 4b6b9386e2f89..7a4d088a23780 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -59,6 +59,7 @@ enum scx_exit_type { SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ }; /* @@ -315,6 +316,15 @@ struct sched_ext_ops { */ u64 flags; + /** + * timeout_ms - The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + /** * name - BPF scheduler's name * @@ -348,6 +358,7 @@ enum scx_ent_flags { SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ + SCX_TASK_WATCHDOG_RESET = 1 << 16, /* task watchdog counter should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ @@ -381,12 +392,14 @@ enum scx_kf_mask { struct sched_ext_entity { struct scx_dispatch_q *dsq; struct list_head dsq_node; + struct list_head watchdog_node; u32 flags; /* protected by rq lock */ u32 weight; s32 sticky_cpu; s32 holding_cpu; u32 kf_mask; /* see scx_kf_mask above */ atomic64_t ops_state; + unsigned long runnable_at; /* BPF scheduler modifiable fields */ diff --git a/init/init_task.c b/init/init_task.c index bdbc663107bfc..913194aab6232 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -106,9 +106,11 @@ struct task_struct init_task #ifdef CONFIG_SCHED_CLASS_EXT .scx = { .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node), + .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node), .sticky_cpu = -1, .holding_cpu = -1, .ops_state = ATOMIC_INIT(0), + .runnable_at = INITIAL_JIFFIES, .slice = SCX_SLICE_DFL, }, #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a3fb6a05d1313..9f721df512f0f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4483,12 +4483,14 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHED_CLASS_EXT p->scx.dsq = NULL; INIT_LIST_HEAD(&p->scx.dsq_node); + INIT_LIST_HEAD(&p->scx.watchdog_node); p->scx.flags = 0; p->scx.weight = 0; p->scx.sticky_cpu = -1; p->scx.holding_cpu = -1; p->scx.kf_mask = 0; atomic64_set(&p->scx.ops_state, 0); + p->scx.runnable_at = INITIAL_JIFFIES; p->scx.slice = SCX_SLICE_DFL; #endif @@ -5651,6 +5653,7 @@ void scheduler_tick(void) if (sched_feat(LATENCY_WARN) && resched_latency) resched_latency_warn(cpu, resched_latency); + scx_notify_sched_tick(); perf_event_task_tick(); #ifdef CONFIG_SMP diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 781d2e4c9c5dc..a8a79783733c3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,7 @@ enum scx_internal_consts { 
SCX_NR_ONLINE_OPS = SCX_OP_IDX(init), SCX_DSP_DFL_MAX_BATCH = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, }; enum scx_ops_enable_state { @@ -87,6 +88,23 @@ static struct scx_exit_info scx_exit_info; static atomic64_t scx_nr_rejected = ATOMIC64_INIT(0); +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. + */ +unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + /* idle tracking */ #ifdef CONFIG_SMP #ifdef CONFIG_CPUMASK_OFFSTACK @@ -146,10 +164,6 @@ static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags); -__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type, - const char *fmt, ...); -#define scx_ops_error(fmt, args...) \ - scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) struct scx_task_iter { struct sched_ext_entity cursor; @@ -699,6 +713,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, dispatch_enqueue(&scx_dsq_global, p, enq_flags); } +static bool watchdog_task_watched(const struct task_struct *p) +{ + return !list_empty(&p->scx.watchdog_node); +} + +static void watchdog_watch_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_WATCHDOG_RESET; + list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list); +} + +static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) +{ + list_del_init(&p->scx.watchdog_node); + if (reset_timeout) + p->scx.flags |= SCX_TASK_WATCHDOG_RESET; +} + static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) { int sticky_cpu = p->scx.sticky_cpu; @@ -717,9 +752,12 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) sticky_cpu = cpu_of(rq); - if (p->scx.flags & SCX_TASK_QUEUED) + if (p->scx.flags & SCX_TASK_QUEUED) { + WARN_ON_ONCE(!watchdog_task_watched(p)); return; + } + watchdog_watch_task(rq, p); p->scx.flags |= SCX_TASK_QUEUED; rq->scx.nr_running++; add_nr_running(rq, 1); @@ -731,6 +769,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) { u64 opss; + watchdog_unwatch_task(p, false); + /* acquire ensures that we see the preceding updates on QUEUED */ opss = atomic64_read_acquire(&p->scx.ops_state); @@ -775,8 +815,10 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags { struct scx_rq *scx_rq = &rq->scx; - if (!(p->scx.flags & SCX_TASK_QUEUED)) + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(watchdog_task_watched(p)); return; + } ops_dequeue(p, deq_flags); @@ -1300,6 +1342,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) } p->se.exec_start = rq_clock_task(rq); + + watchdog_unwatch_task(p, true); } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -1343,11 +1387,14 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct 
*p) */ if (p->scx.flags & SCX_TASK_BAL_KEEP) { p->scx.flags &= ~SCX_TASK_BAL_KEEP; + watchdog_watch_task(rq, p); dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); return; } if (p->scx.flags & SCX_TASK_QUEUED) { + watchdog_watch_task(rq, p); + /* * If @p has slice left and balance_scx() didn't tag it for * keeping, @p is getting preempted by a higher priority @@ -1576,6 +1623,49 @@ static void reset_idle_masks(void) {} #endif /* CONFIG_SMP */ +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + list_for_each_entry(p, &rq->scx.watchdog_list, scx.watchdog_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_type(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + rq_unlock_irqrestore(rq, &rf); + + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + scx_watchdog_timestamp = jiffies; + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { update_curr_scx(rq); @@ -1607,7 +1697,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) } } - p->scx.flags |= SCX_TASK_OPS_PREPPED; + p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET); return 0; } @@ -1926,6 +2016,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) break; } + cancel_delayed_work_sync(&scx_watchdog_work); + switch (type) { case SCX_EXIT_UNREG: reason = "BPF scheduler unregistered"; @@ -1939,6 +2031,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) case SCX_EXIT_ERROR_BPF: reason = "scx_bpf_error"; break; + case SCX_EXIT_ERROR_STALL: + reason = "runnable task stall"; + break; default: reason = ""; } @@ -2123,8 +2218,8 @@ static void scx_ops_error_irq_workfn(struct irq_work *irq_work) static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type, - const char *fmt, ...) +__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, + const char *fmt, ...) { struct scx_exit_info *ei = &scx_exit_info; int none = SCX_EXIT_NONE; @@ -2223,6 +2318,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err_disable; } + scx_watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; + if (ops->timeout_ms) + scx_watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); + + scx_watchdog_timestamp = jiffies; + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + /* * Lock out forks before opening the floodgate so that they don't wander * into the operations prematurely. 
@@ -2476,6 +2579,11 @@ static int bpf_scx_init_member(const struct btf_type *t, if (ret == 0) return -EINVAL; return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (*(u32 *)(udata + moff) > SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; } return 0; @@ -2573,9 +2681,11 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + INIT_LIST_HEAD(&rq->scx.watchdog_list); } register_sysrq_key('S', &sysrq_sched_ext_reset_op); + INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); } diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index f8d5682deacfd..7dfa7b888487b 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -56,6 +56,8 @@ enum scx_deq_flags { extern const struct sched_class ext_sched_class; extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; extern const struct file_operations sched_ext_fops; +extern unsigned long scx_watchdog_timeout; +extern unsigned long scx_watchdog_timestamp; DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) @@ -67,6 +69,28 @@ void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); void init_sched_ext_class(void); +__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, + const char *fmt, ...); +#define scx_ops_error(fmt, args...) \ + scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) + +static inline void scx_notify_sched_tick(void) +{ + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = scx_watchdog_timestamp; + if (unlikely(time_after(jiffies, last_check + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_type(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } +} + static inline const struct sched_class *next_active_class(const struct sched_class *class) { class++; @@ -98,6 +122,7 @@ static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline void init_sched_ext_class(void) {} +static inline void scx_notify_sched_tick(void) {} #define for_each_active_class for_each_class #define for_balance_class_range for_class_range diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae4cd306bf287..bd9851ee02570 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -688,6 +688,7 @@ struct cfs_rq { #ifdef CONFIG_SCHED_CLASS_EXT struct scx_rq { struct scx_dispatch_q local_dsq; + struct list_head watchdog_list; u64 ops_qseq; u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 60e260577a3a1..2a969c68a2e49 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -25,6 +25,8 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 stall_user_nth; +const volatile u32 stall_kernel_nth; u32 test_error_cnt; @@ -120,11 +122,20 @@ static int weight_to_idx(u32 weight) void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) { + static u32 user_cnt, kernel_cnt; struct task_ctx *tctx; u32 pid = p->pid; int idx = weight_to_idx(p->scx.weight); void *ring; + if (p->flags & PF_KTHREAD) { + if (stall_kernel_nth && !(++kernel_cnt % 
stall_kernel_nth)) + return; + } else { + if (stall_user_nth && !(++user_cnt % stall_user_nth)) + return; + } + if (test_error_cnt && !--test_error_cnt) scx_bpf_error("test triggering error"); @@ -237,5 +248,6 @@ struct sched_ext_ops qmap_ops = { .dispatch = (void *)qmap_dispatch, .prep_enable = (void *)qmap_prep_enable, .exit = (void *)qmap_exit, + .timeout_ms = 5000U, .name = "qmap", }; diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index 56c85c9fa979d..3f98534c2a9c9 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -20,10 +20,12 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -t COUNT Stall every COUNT'th user thread\n" +" -T COUNT Stall every COUNT'th kernel thread\n" " -h Display this help and exit\n"; static volatile int exit_req; @@ -47,7 +49,7 @@ int main(int argc, char **argv) skel = scx_example_qmap__open(); assert(skel); - while ((opt = getopt(argc, argv, "s:e:tTd:h")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:d:h")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -55,6 +57,12 @@ int main(int argc, char **argv) case 'e': skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); break; + case 't': + skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); + break; + case 'T': + skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); + break; default: fprintf(stderr, help_fmt, basename(argv[0])); return opt != 'h'; From 4a93fea69bfe3f4bbe41a7d0a8759aa71cd8e215 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:06 -1000 Subject: [PATCH 017/304] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT BPF schedulers might not want to schedule certain tasks - e.g. kernel threads. This patch adds p->scx.disallow which can be set by BPF schedulers in such cases. The field can be changed anytime and setting it in ops.prep_enable() guarantees that the task can never be scheduled by sched_ext. scx_example_qmap is updated with the -d option to disallow a specific PID: # echo $$ 1092 # egrep '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 # egrep '(policy)|(ext\.enabled)' /proc/self/sched policy : 7 ext.enabled : 0 Run "scx_example_qmap -d 1092" in another terminal. 
# grep rejected /sys/kernel/debug/sched/ext nr_rejected : 1 # egrep '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 setparam failed for 1092 (Permission denied) Signed-off-by: Tejun Heo Suggested-by: Barret Rhoden Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++ kernel/sched/core.c | 4 +++ kernel/sched/ext.c | 38 ++++++++++++++++++++++++++ kernel/sched/ext.h | 3 ++ tools/sched_ext/scx_example_qmap.bpf.c | 4 +++ tools/sched_ext/scx_example_qmap.c | 8 +++++- 6 files changed, 68 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 7a4d088a23780..da85bc3751ad9 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -411,6 +411,18 @@ struct sched_ext_entity { */ u64 slice; + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.prep_enable() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inhering the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of such + * events are reported through /sys/kernel/debug/sched_ext::nr_rejected. + */ + bool disallow; /* reject switching into SCX */ + /* cold fields */ struct list_head tasks_node; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f721df512f0f..08a72f146f9d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7665,6 +7665,10 @@ static int __sched_setscheduler(struct task_struct *p, goto unlock; } + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + /* * If not changing anything there's no need to proceed further, * but store a possible modification of reset_on_fork. diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a8a79783733c3..90281532446e4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1687,6 +1687,8 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) WARN_ON_ONCE(p->scx.flags & SCX_TASK_OPS_PREPPED); + p->scx.disallow = false; + if (SCX_HAS_OP(prep_enable)) { struct scx_enable_args args = { }; @@ -1697,6 +1699,27 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) } } + if (p->scx.disallow) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're either in fork or load path and @p->policy will be + * applied right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.prep_enable() sets @p->disallow, @p + * can never be in SCX. 
+ */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic64_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } + p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET); return 0; } @@ -1845,6 +1868,18 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} static void switched_to_scx(struct rq *rq, struct task_struct *p) {} +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + /* * Omitted operations: * @@ -2527,6 +2562,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; } return 0; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 7dfa7b888487b..76c94babd19e7 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -67,6 +67,7 @@ void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); +int scx_check_setscheduler(struct task_struct *p, int policy); void init_sched_ext_class(void); __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, @@ -121,6 +122,8 @@ static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int scx_check_setscheduler(struct task_struct *p, + int policy) { return 0; } static inline void init_sched_ext_class(void) {} static inline void scx_notify_sched_tick(void) {} diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 2a969c68a2e49..0e4cccf878f56 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -27,6 +27,7 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; +const volatile s32 disallow_tgid; u32 test_error_cnt; @@ -224,6 +225,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, struct scx_enable_args *args) { + if (p->tgid == disallow_tgid) + p->scx.disallow = true; + /* * @p is new. Let's ensure that its task_ctx is available. We can sleep * in this function and the following will automatically use GFP_KERNEL. 
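
(Illustration only, not part of the patch: the user-visible rejection described in the commit message can be exercised with a small standalone tool along the lines of the "set-scx" helper shown there. The sketch below assumes SCHED_EXT is policy 7, matching the "policy : 7" output above, and that it takes priority 0 like SCHED_NORMAL; the file name and the fallback define are illustrative.)

	/* try-scx.c - attempt to switch a PID into SCHED_EXT (sketch) */
	#include <sys/types.h>
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <errno.h>

	#ifndef SCHED_EXT
	#define SCHED_EXT 7		/* assumed; matches "policy : 7" above */
	#endif

	int main(int argc, char **argv)
	{
		struct sched_param param = { .sched_priority = 0 };
		pid_t pid;

		if (argc < 2)
			return 1;
		pid = atoi(argv[1]);

		/* fails with EACCES when ops.prep_enable() set p->scx.disallow */
		if (sched_setscheduler(pid, SCHED_EXT, &param))
			fprintf(stderr, "setparam failed for %d (%s)\n",
				pid, strerror(errno));
		return 0;
	}
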
diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index 3f98534c2a9c9..d080a0c853c09 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -20,12 +20,13 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -h Display this help and exit\n"; static volatile int exit_req; @@ -63,6 +64,11 @@ int main(int argc, char **argv) case 'T': skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) + skel->rodata->disallow_tgid = getpid(); + break; default: fprintf(stderr, help_fmt, basename(argv[0])); return opt != 'h'; From fcbc9acbbc1c2abab7e0b10c3140ce13d38d1a59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:06 -1000 Subject: [PATCH 018/304] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext Currently, to use sched_ext, each task has to be put into sched_ext using sched_setscheduler(2). However, some BPF schedulers and use cases might prefer to service all eligible tasks. This patch adds a new kfunc helper, scx_bpf_switch_all(), that BPF schedulers can call from ops.init() to switch all SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE tasks into sched_ext. This has the benefit that the scheduler swaps are transparent to the users and applications. As we know that CFS is not being used when scx_bpf_switch_all() is used, we can also disable hot path entry points with static_branches. Both the simple and qmap example schedulers are updated to switch all tasks by default to ease testing. '-p' option is added which enables the original behavior of switching only tasks which are explicitly on SCHED_EXT. v2: In the example schedulers, switch all tasks by default. Signed-off-by: Tejun Heo Suggested-by: Barret Rhoden Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 8 +++-- kernel/sched/ext.c | 45 ++++++++++++++++++++++++ kernel/sched/ext.h | 5 +++ tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_qmap.bpf.c | 9 +++++ tools/sched_ext/scx_example_qmap.c | 8 +++-- tools/sched_ext/scx_example_simple.bpf.c | 10 ++++++ tools/sched_ext/scx_example_simple.c | 8 +++-- 8 files changed, 87 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 08a72f146f9d4..bc90327f950df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1206,7 +1206,7 @@ bool sched_can_stop_tick(struct rq *rq) * if there's more than one we need the tick for involuntary * preemption. 
*/ - if (rq->nr_running > 1) + if (!scx_switched_all() && rq->nr_running > 1) return false; return true; @@ -5657,8 +5657,10 @@ void scheduler_tick(void) perf_event_task_tick(); #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + trigger_load_balance(rq); + } #endif } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 90281532446e4..096894ea8e261 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -73,6 +73,10 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static bool scx_switch_all_req; +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + static struct sched_ext_ops scx_ops; static bool scx_warned_zero_slice; @@ -2015,6 +2019,8 @@ bool task_on_scx(struct task_struct *p) { if (!scx_enabled() || scx_ops_disabling()) return false; + if (READ_ONCE(scx_switching_all)) + return true; return p->policy == SCHED_EXT; } @@ -2141,6 +2147,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) */ mutex_lock(&scx_ops_enable_mutex); + static_branch_disable(&__scx_switched_all); + WRITE_ONCE(scx_switching_all, false); + /* avoid racing against fork */ cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); @@ -2325,6 +2334,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) */ cpus_read_lock(); + scx_switch_all_req = false; if (scx_ops.init) { ret = SCX_CALL_OP_RET(SCX_KF_INIT, init); if (ret) { @@ -2440,6 +2450,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ + WRITE_ONCE(scx_switching_all, scx_switch_all_req); + scx_task_iter_init(&sti); while ((p = scx_task_iter_next_filtered_locked(&sti))) { if (READ_ONCE(p->__state) != TASK_DEAD) { @@ -2471,6 +2483,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err_disable; } + if (scx_switch_all_req) + static_branch_enable_cpuslocked(&__scx_switched_all); + cpus_read_unlock(); mutex_unlock(&scx_ops_enable_mutex); @@ -2505,6 +2520,9 @@ static int scx_debug_show(struct seq_file *m, void *v) mutex_lock(&scx_ops_enable_mutex); seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled()); + seq_printf(m, "%-30s: %d\n", "switching_all", + READ_ONCE(scx_switching_all)); + seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all()); seq_printf(m, "%-30s: %s\n", "enable_state", scx_ops_enable_state_str[scx_ops_enable_state()]); seq_printf(m, "%-30s: %llu\n", "nr_rejected", @@ -2737,6 +2755,31 @@ __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in vmlinux BTF"); +/** + * scx_bpf_switch_all - Switch all tasks into SCX + * @into_scx: switch direction + * + * If @into_scx is %true, all existing and future non-dl/rt tasks are switched + * to SCX. If %false, only tasks which have %SCHED_EXT explicitly set are put on + * SCX. The actual switching is asynchronous. Can be called from ops.init(). 
+ */ +void scx_bpf_switch_all(void) +{ + if (!scx_kf_allowed(SCX_KF_INIT)) + return; + + scx_switch_all_req = true; +} + +BTF_SET8_START(scx_kfunc_ids_init) +BTF_ID_FLAGS(func, scx_bpf_switch_all) +BTF_SET8_END(scx_kfunc_ids_init) + +static const struct btf_kfunc_id_set scx_kfunc_set_init = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_init, +}; + /** * scx_bpf_create_dsq - Create a custom DSQ * @dsq_id: DSQ to create @@ -3188,6 +3231,8 @@ static int __init register_ext_kfuncs(void) * check using scx_kf_allowed(). */ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_init)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_sleepable)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 76c94babd19e7..a4fe649e649d1 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -60,7 +60,9 @@ extern unsigned long scx_watchdog_timeout; extern unsigned long scx_watchdog_timestamp; DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) bool task_on_scx(struct task_struct *p); void scx_pre_fork(struct task_struct *p); @@ -95,6 +97,8 @@ static inline void scx_notify_sched_tick(void) static inline const struct sched_class *next_active_class(const struct sched_class *class) { class++; + if (scx_switched_all() && class == &fair_sched_class) + class++; if (!scx_enabled() && class == &ext_sched_class) class++; return class; @@ -117,6 +121,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #else /* CONFIG_SCHED_CLASS_EXT */ #define scx_enabled() false +#define scx_switched_all() false static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index d6cb32980dfd4..e3a046e565d00 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -52,6 +52,7 @@ void ___scx_bpf_error_format_checker(const char *fmt, ...) 
{} ___scx_bpf_error_format_checker(fmt, ##args); \ }) +void scx_bpf_switch_all(void) __ksym; s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; u32 scx_bpf_dispatch_nr_slots(void) __ksym; diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 0e4cccf878f56..abb134fb18cea 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -25,6 +25,7 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile bool switch_partial; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile s32 disallow_tgid; @@ -239,6 +240,13 @@ s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, return -ENOMEM; } +s32 BPF_STRUCT_OPS(qmap_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) { uei_record(&uei, ei); @@ -251,6 +259,7 @@ struct sched_ext_ops qmap_ops = { .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, .prep_enable = (void *)qmap_prep_enable, + .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, .name = "qmap", diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index d080a0c853c09..f94fd39c4ed81 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -20,13 +20,14 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID] [-p]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" " -h Display this help and exit\n"; static volatile int exit_req; @@ -50,7 +51,7 @@ int main(int argc, char **argv) skel = scx_example_qmap__open(); assert(skel); - while ((opt = getopt(argc, argv, "s:e:t:T:d:h")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:d:ph")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -69,6 +70,9 @@ int main(int argc, char **argv) if (skel->rodata->disallow_tgid < 0) skel->rodata->disallow_tgid = getpid(); break; + case 'p': + skel->rodata->switch_partial = true; + break; default: fprintf(stderr, help_fmt, basename(argv[0])); return opt != 'h'; diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_example_simple.bpf.c index 74716d0dd08d3..fa5ae683ace1e 100644 --- a/tools/sched_ext/scx_example_simple.bpf.c +++ b/tools/sched_ext/scx_example_simple.bpf.c @@ -15,6 +15,8 @@ char _license[] SEC("license") = "GPL"; +const volatile bool switch_partial; + struct user_exit_info uei; struct { @@ -43,6 +45,13 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } +s32 BPF_STRUCT_OPS(simple_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) { uei_record(&uei, ei); @@ -51,6 +60,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SEC(".struct_ops") struct sched_ext_ops simple_ops = { .enqueue = (void *)simple_enqueue, + 
.init = (void *)simple_init, .exit = (void *)simple_exit, .name = "simple", }; diff --git a/tools/sched_ext/scx_example_simple.c b/tools/sched_ext/scx_example_simple.c index 2f1ee40f7e5a1..868fd39e45c7e 100644 --- a/tools/sched_ext/scx_example_simple.c +++ b/tools/sched_ext/scx_example_simple.c @@ -19,8 +19,9 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s\n" +"Usage: %s [-p]\n" "\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" " -h Display this help and exit\n"; static volatile int exit_req; @@ -64,8 +65,11 @@ int main(int argc, char **argv) skel = scx_example_simple__open(); assert(skel); - while ((opt = getopt(argc, argv, "h")) != -1) { + while ((opt = getopt(argc, argv, "ph")) != -1) { switch (opt) { + case 'p': + skel->rodata->switch_partial = true; + break; default: fprintf(stderr, help_fmt, basename(argv[0])); return opt != 'h'; From f7b00197768abebaeef6bdace58742c3728db718 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 019/304] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support It's often useful to wake up and/or trigger reschedule on other CPUs. This patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to kick the target CPU into the scheduling path. As a sched_ext task relinquishes its CPU only after its slice is depleted, this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT which clears the slice of the target CPU's current task to guarantee that sched_ext's scheduling path runs on the CPU. This patch also adds a new example scheduler, scx_example_central, which demonstrates central scheduling where one CPU is responsible for making all scheduling decisions in the system. The central CPU makes scheduling decisions for all CPUs in the system, queues tasks on the appropriate local dsq's and preempts the worker CPUs. The worker CPUs in turn preempt the central CPU when it needs tasks to run. Currently, every CPU depends on its own tick to expire the current task. A follow-up patch implementing tickless support for sched_ext will allow the worker CPUs to go full tickless so that they can run completely undisturbed. v3: * Make scx_example_central switch all tasks by default. * Convert to BPF inline iterators. v2: * Julia Lawall reported that scx_example_central can overflow the dispatch buffer and malfunction. As scheduling for other CPUs can't be handled by the automatic retry mechanism, fix by implementing an explicit overflow and retry handling. * Updated to use generic BPF cpumask helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Julia Lawall --- include/linux/sched/ext.h | 4 + kernel/sched/ext.c | 82 +++++++- kernel/sched/ext.h | 12 ++ kernel/sched/sched.h | 3 + tools/sched_ext/.gitignore | 1 + tools/sched_ext/Makefile | 8 +- tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_central.bpf.c | 225 ++++++++++++++++++++++ tools/sched_ext/scx_example_central.c | 93 +++++++++ 9 files changed, 424 insertions(+), 5 deletions(-) create mode 100644 tools/sched_ext/scx_example_central.bpf.c create mode 100644 tools/sched_ext/scx_example_central.c diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index da85bc3751ad9..ac19b720374d4 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -408,6 +408,10 @@ struct sched_ext_entity { * scx_bpf_dispatch() but can also be modified directly by the BPF * scheduler. 
Automatically decreased by SCX as the task executes. On * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. */ u64 slice; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 096894ea8e261..93f59337cfd65 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -497,7 +497,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } - if (enq_flags & SCX_ENQ_HEAD) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_node, &dsq->fifo); else list_add_tail(&p->scx.dsq_node, &dsq->fifo); @@ -513,8 +513,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (is_local) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; - if (sched_class_above(&ext_sched_class, rq->curr->sched_class)) + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) resched_curr(rq); } else { raw_spin_unlock(&dsq->lock); @@ -1888,7 +1896,9 @@ int scx_check_setscheduler(struct task_struct *p, int policy) * Omitted operations: * * - check_preempt_curr: NOOP as it isn't useful in the wakeup path because the - * task isn't tied to the CPU at that point. + * task isn't tied to the CPU at that point. Preemption is implemented by + * resetting the victim task's slice to 0 and triggering reschedule on the + * target CPU. * * - migrate_task_rq: Unncessary as task to cpu mapping is transient. * @@ -2715,6 +2725,32 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { .enable_mask = SYSRQ_ENABLE_RTNICE, }; +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + int this_cpu = cpu_of(this_rq); + int cpu; + + for_each_cpu(cpu, this_rq->scx.cpus_to_kick) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (cpu_online(cpu) || cpu == this_cpu) { + if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) && + rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + resched_curr(rq); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); + } + + cpumask_clear(this_rq->scx.cpus_to_kick); + cpumask_clear(this_rq->scx.cpus_to_preempt); +} + void __init init_sched_ext_class(void) { int cpu; @@ -2738,6 +2774,10 @@ void __init init_sched_ext_class(void) init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.watchdog_list); + + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); } register_sysrq_key('S', &sysrq_sched_ext_reset_op); @@ -2974,6 +3014,41 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .set = &scx_kfunc_ids_dispatch, }; +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. 
+ */ +void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *rq; + + if (!ops_cpu_valid(cpu)) { + scx_ops_error("invalid cpu %d", cpu); + return; + } + + preempt_disable(); + rq = this_rq(); + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. + */ + cpumask_set_cpu(cpu, rq->scx.cpus_to_kick); + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt); + + irq_work_queue(&rq->scx.kick_cpus_irq_work); + preempt_enable(); +} + /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ @@ -3195,6 +3270,7 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) } BTF_SET8_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index a4fe649e649d1..0b04626e8ca20 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -19,6 +19,14 @@ enum scx_enq_flags { /* high 32bits are SCX specific */ + /* + * Set the following to trigger preemption when calling + * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when @@ -51,6 +59,10 @@ enum scx_deq_flags { SCX_DEQ_SLEEP = DEQUEUE_SLEEP, }; +enum scx_kick_flags { + SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ +}; + #ifdef CONFIG_SCHED_CLASS_EXT extern const struct sched_class ext_sched_class; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bd9851ee02570..cbdfc7b612258 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -692,6 +692,9 @@ struct scx_rq { u64 ops_qseq; u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_preempt; + struct irq_work kick_cpus_irq_work; }; #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index 2ad3d86caf797..3d8ec46ca304f 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -1,5 +1,6 @@ scx_example_simple scx_example_qmap +scx_example_central *.skel.h *.subskel.h /tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 8f0f14bb59ff0..bcec7c1fb7b19 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -115,7 +115,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wall -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_example_simple scx_example_qmap +all: scx_example_simple scx_example_qmap scx_example_central # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -174,10 +174,14 @@ scx_example_qmap: scx_example_qmap.c scx_example_qmap.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +scx_example_central: scx_example_central.c scx_example_central.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + clean: rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h - rm -f scx_example_simple scx_example_qmap + rm -f 
scx_example_simple scx_example_qmap scx_example_central .PHONY: all clean diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index e3a046e565d00..647ec7a99bd01 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -57,6 +57,7 @@ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym; diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c new file mode 100644 index 0000000000000..443504fa68f84 --- /dev/null +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A central FIFO sched_ext scheduler which demonstrates the followings: + * + * a. Making all scheduling decisions from one CPU: + * + * The central CPU is the only one making scheduling decisions. All other + * CPUs kick the central CPU when they run out of tasks to run. + * + * There is one global BPF queue and the central CPU schedules all CPUs by + * dispatching from the global queue to each CPU's local dsq from dispatch(). + * This isn't the most straightforward. e.g. It'd be easier to bounce + * through per-CPU BPF queues. The current design is chosen to maximally + * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. + * + * b. Preemption + * + * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the + * next tasks. + * + * This scheduler is designed to maximize usage of various SCX mechanisms. A + * more practical implementation would likely put the scheduling loop outside + * the central CPU's dispatch() path and add some form of priority mechanism. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, + MAX_CPUS = 4096, + MS_TO_NS = 1000LLU * 1000, + TIMER_INTERVAL_NS = 1 * MS_TO_NS, +}; + +const volatile bool switch_partial; +const volatile s32 central_cpu; +const volatile u32 nr_cpu_ids = 64; /* !0 for veristat, set during init */ + +u64 nr_total, nr_locals, nr_queued, nr_lost_pids; +u64 nr_dispatches, nr_mismatches, nr_retries; +u64 nr_overflows; + +struct user_exit_info uei; + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, s32); +} central_q SEC(".maps"); + +/* can't use percpu map due to bad lookups */ +static bool cpu_gimme_task[MAX_CPUS]; + +struct central_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct central_timer); +} central_timer SEC(".maps"); + +s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* + * Steer wakeups to the central CPU as much as possible to avoid + * disturbing other CPUs. It's safe to blindly return the central cpu as + * select_cpu() is a hint and if @p can't be on it, the kernel will + * automatically pick a fallback CPU. 
+ */ + return central_cpu; +} + +void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + + __sync_fetch_and_add(&nr_total, 1); + + if (bpf_map_push_elem(¢ral_q, &pid, 0)) { + __sync_fetch_and_add(&nr_overflows, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_queued, 1); + + if (!scx_bpf_task_running(p)) + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); +} + +static bool dispatch_to_cpu(s32 cpu) +{ + struct task_struct *p; + s32 pid; + + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(¢ral_q, &pid)) + break; + + __sync_fetch_and_sub(&nr_queued, 1); + + p = bpf_task_from_pid(pid); + if (!p) { + __sync_fetch_and_add(&nr_lost_pids, 1); + continue; + } + + /* + * If we can't run the task at the top, do the dumb thing and + * bounce it to the fallback dsq. + */ + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + __sync_fetch_and_add(&nr_mismatches, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + bpf_task_release(p); + continue; + } + + /* dispatch to local and mark that @cpu doesn't need more */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + + if (cpu != central_cpu) + scx_bpf_kick_cpu(cpu, 0); + + bpf_task_release(p); + return true; + } + + return false; +} + +void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == central_cpu) { + /* dispatch for all other CPUs first */ + __sync_fetch_and_add(&nr_dispatches, 1); + + bpf_for(cpu, 0, nr_cpu_ids) { + bool *gimme; + + if (!scx_bpf_dispatch_nr_slots()) + break; + + /* central's gimme is never set */ + gimme = MEMBER_VPTR(cpu_gimme_task, [cpu]); + if (gimme && !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) + *gimme = false; + } + + /* + * Retry if we ran out of dispatch buffer slots as we might have + * skipped some CPUs and also need to dispatch for self. The ext + * core automatically retries if the local dsq is empty but we + * can't rely on that as we're dispatching for other CPUs too. + * Kick self explicitly to retry. + */ + if (!scx_bpf_dispatch_nr_slots()) { + __sync_fetch_and_add(&nr_retries, 1); + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + return; + } + + /* look for a task to run on the central CPU */ + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + dispatch_to_cpu(central_cpu); + } else { + bool *gimme; + + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + + gimme = MEMBER_VPTR(cpu_gimme_task, [cpu]); + if (gimme) + *gimme = true; + + /* + * Force dispatch on the scheduling CPU so that it finds a task + * to run for us. + */ + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + } +} + +int BPF_STRUCT_OPS_SLEEPABLE(central_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + + return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); +} + +void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops central_ops = { + /* + * We are offloading all scheduling decisions to the central CPU and + * thus being the last task on a given CPU doesn't mean anything + * special. Enqueue the last tasks like any other tasks. 
+ */ + .flags = SCX_OPS_ENQ_LAST, + + .select_cpu = (void *)central_select_cpu, + .enqueue = (void *)central_enqueue, + .dispatch = (void *)central_dispatch, + .init = (void *)central_init, + .exit = (void *)central_exit, + .name = "central", +}; diff --git a/tools/sched_ext/scx_example_central.c b/tools/sched_ext/scx_example_central.c new file mode 100644 index 0000000000000..959b305a93a98 --- /dev/null +++ b/tools/sched_ext/scx_example_central.c @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_central.skel.h" + +const char help_fmt[] = +"A central FIFO sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-c CPU] [-p]\n" +"\n" +" -c CPU Override the central CPU (default: 0)\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_example_central *skel; + struct bpf_link *link; + u64 seq = 0; + s32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_central__open(); + assert(skel); + + skel->rodata->central_cpu = 0; + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "c:ph")) != -1) { + switch (opt) { + case 'c': + skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + break; + case 'p': + skel->rodata->switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + assert(!scx_example_central__load(skel)); + + link = bpf_map__attach_struct_ops(skel->maps.central_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + printf("[SEQ %lu]\n", seq++); + printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", + skel->bss->nr_total, + skel->bss->nr_locals, + skel->bss->nr_queued, + skel->bss->nr_lost_pids); + printf(" dispatch:%10lu mismatch:%10lu retry:%10lu\n", + skel->bss->nr_dispatches, + skel->bss->nr_mismatches, + skel->bss->nr_retries); + printf("overflow:%10lu\n", + skel->bss->nr_overflows); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_central__destroy(skel); + return 0; +} From ea1d8052c56ebca6824b695fd6b18eb961a7c7b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 020/304] sched_ext: Make watchdog handle ops.dispatch() looping stall The dispatch path retries if the local DSQ is still empty after ops.dispatch() either dispatched or consumed a task. This is both out of necessity and for convenience. It has to retry because the dispatch path might lose the tasks to dequeue while the rq lock is released while trying to migrate tasks across CPUs, and the retry mechanism makes ops.dispatch() implementation easier as it only needs to make some forward progress each iteration. However, this makes it possible for ops.dispatch() to stall CPUs by repeatedly dispatching ineligible tasks. If all CPUs are stalled that way, the watchdog or sysrq handler can't run and the system can't be saved. 
Let's address the issue by breaking out of the dispatch loop after 32 iterations. It is unlikely but not impossible for ops.dispatch() to legitimately go over the iteration limit. We want to come back to the dispatch path in such cases as not doing so risks stalling the CPU by idling with runnable tasks pending. As the previous task is still current in balance_scx(), resched_curr() doesn't do anything - it will just get cleared. Let's instead use scx_kick_bpf() which will trigger reschedule after switching to the next task which will likely be the idle task. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- kernel/sched/ext.c | 17 +++++++++++++++++ tools/sched_ext/scx_example_qmap.bpf.c | 17 +++++++++++++++++ tools/sched_ext/scx_example_qmap.c | 8 ++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 93f59337cfd65..3d4a182d0ee11 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,7 @@ enum scx_internal_consts { SCX_NR_ONLINE_OPS = SCX_OP_IDX(init), SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, }; @@ -168,6 +169,7 @@ static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags); +void scx_bpf_kick_cpu(s32 cpu, u64 flags); struct scx_task_iter { struct sched_ext_entity cursor; @@ -1287,6 +1289,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, struct scx_rq *scx_rq = &rq->scx; struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; + int nr_loops = SCX_DSP_MAX_LOOPS; lockdep_assert_rq_held(rq); @@ -1341,6 +1344,20 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, return 1; if (consume_dispatch_q(rq, rf, &scx_dsq_global)) return 1; + + /* + * ops.dispatch() can trap us in this loop by repeatedly + * dispatching ineligible tasks. Break out once in a while to + * allow the watchdog to run. As IRQ can't be enabled in + * balance(), we want to complete this scheduling cycle and then + * start a new one. IOW, we want to call resched_curr() on the + * next, most likely idle, task, not the current one. Use + * scx_bpf_kick_cpu() for deferred kicking. + */ + if (unlikely(!--nr_loops)) { + scx_bpf_kick_cpu(cpu_of(rq), 0); + break; + } } while (dspc->nr_tasks); return 0; diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index abb134fb18cea..ed704a4024c0d 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -28,6 +28,7 @@ const volatile u64 slice_ns = SCX_SLICE_DFL; const volatile bool switch_partial; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; const volatile s32 disallow_tgid; u32 test_error_cnt; @@ -187,6 +188,22 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) s32 pid; int i; + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + struct task_struct *p; + + /* + * PID 2 should be kthreadd which should mostly be idle and off + * the scheduler. Let's keep dispatching it to force the kernel + * to call this function over and over again. 
+ */ + p = bpf_task_from_pid(2); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + if (!idx || !cnt) { scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt); return; diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index f94fd39c4ed81..3f68dae47bd06 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -20,12 +20,13 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID] [-p]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID] [-p]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -h Display this help and exit\n"; @@ -51,7 +52,7 @@ int main(int argc, char **argv) skel = scx_example_qmap__open(); assert(skel); - while ((opt = getopt(argc, argv, "s:e:t:T:d:ph")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -65,6 +66,9 @@ int main(int argc, char **argv) case 'T': skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); break; + case 'l': + skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); + break; case 'd': skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); if (skel->rodata->disallow_tgid < 0) From f5d64586458a7dcdfe1907b0e618cb99cd2335a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 021/304] sched_ext: Add task state tracking operations Being able to track the task runnable and running state transitions are useful for a variety of purposes including latency tracking and load factor calculation. Currently, BPF schedulers don't have a good way of tracking these transitions. Becoming runnable can be determined from ops.enqueue() but becoming quiescent can only be inferred from the lack of subsequent enqueue. Also, as the local dsq can have multiple tasks and some events are handled in the sched_ext core, it's difficult to determine when a given task starts and stops executing. This patch adds sched_ext_ops.runnable(), .running(), .stopping() and .quiescent() operations to track the task runnable and running state transitions. They're mostly self explanatory; however, we want to ensure that running <-> stopping transitions are always contained within runnable <-> quiescent transitions which is a bit different from how the scheduler core behaves. This adds a bit of complication. See the comment in dequeue_task_scx(). 
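
As an aside (not part of this patch), the latency-tracking use case mentioned above maps naturally onto these callbacks. Below is a minimal BPF-side sketch, assuming the scx_common.bpf.h helpers used by the example schedulers and BPF task local storage; the map, struct and function names are illustrative:

	#include "scx_common.bpf.h"

	struct task_lat {
		u64 runnable_at;	/* 0 while not waiting to run */
	};

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, struct task_lat);
	} task_lats SEC(".maps");

	u64 total_wait_ns;	/* summed runnable -> running latency */

	void BPF_STRUCT_OPS(lat_runnable, struct task_struct *p, u64 enq_flags)
	{
		struct task_lat *lat;

		/* stamp the moment the task became runnable */
		lat = bpf_task_storage_get(&task_lats, p, 0,
					   BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (lat)
			lat->runnable_at = bpf_ktime_get_ns();
	}

	void BPF_STRUCT_OPS(lat_running, struct task_struct *p)
	{
		struct task_lat *lat;

		/* runnable -> running: accumulate how long the task waited */
		lat = bpf_task_storage_get(&task_lats, p, 0, 0);
		if (lat && lat->runnable_at) {
			__sync_fetch_and_add(&total_wait_ns,
					     bpf_ktime_get_ns() - lat->runnable_at);
			lat->runnable_at = 0;
		}
	}

These would be wired up through the .runnable and .running members of the scheduler's sched_ext_ops, alongside .enqueue and friends.
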
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 65 +++++++++++++++++++++++++++++++++++++++ kernel/sched/ext.c | 31 +++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index ac19b720374d4..16eb54635e350 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -192,6 +192,71 @@ struct sched_ext_ops { */ void (*dispatch)(s32 cpu, struct task_struct *prev); + /** + * runnable - A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU. Likewise, a task + * may be ->enqueue()'d without being preceded by this operation e.g. + * after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * running - A task is starting to run on its associated CPU + * @p: task starting to run + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * stopping - A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * quiescent - A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + /** * yield - Yield CPU * @from: yielding task diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3d4a182d0ee11..dda9dbba896d9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -776,6 +776,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags rq->scx.nr_running++; add_nr_running(rq, 1); + if (SCX_HAS_OP(runnable)) + SCX_CALL_OP(SCX_KF_REST, runnable, p, enq_flags); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); } @@ -836,6 +839,26 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags ops_dequeue(p, deq_flags); + /* + * A currently running task which is going off @rq first gets dequeued + * and then stops running. 
As we want running <-> stopping transitions + * to be contained within runnable <-> quiescent transitions, trigger + * ->stopping() early here instead of in put_prev_task_scx(). + * + * @p may go through multiple stopping <-> running transitions between + * here and put_prev_task_scx() if task attribute changes occur while + * balance_scx() leaves @rq unlocked. However, they don't contain any + * information meaningful to the BPF scheduler and can be suppressed by + * skipping the callbacks if the task is !QUEUED. + */ + if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + update_curr_scx(rq); + SCX_CALL_OP(SCX_KF_REST, stopping, p, false); + } + + if (SCX_HAS_OP(quiescent)) + SCX_CALL_OP(SCX_KF_REST, quiescent, p, deq_flags); + if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; else @@ -1372,6 +1395,10 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) p->se.exec_start = rq_clock_task(rq); + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP(SCX_KF_REST, running, p); + watchdog_unwatch_task(p, true); } @@ -1410,6 +1437,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) update_curr_scx(rq); + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP(SCX_KF_REST, stopping, p, true); + /* * If we're being called from put_prev_task_balance(), balance_scx() may * have decided that @p should keep running. From ee794ea6051be173537824db9c2ee22bf70d9994 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 022/304] sched_ext: Implement tickless support Allow BPF schedulers to indicate tickless operation by setting p->scx.slice to SCX_SLICE_INF. A CPU whose current task has infinte slice goes into tickless operation. scx_example_central is updated to use tickless operations for all tasks and instead use a BPF timer to expire slices. This also uses the SCX_ENQ_PREEMPT and task state tracking added by the previous patches. Currently, there is no way to pin the timer on the central CPU, so it may end up on one of the worker CPUs; however, outside of that, the worker CPUs can go tickless both while running sched_ext tasks and idling. With schbench running, scx_example_central shows: root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 142024 656 664 449 Local timer interrupts LOC: 161663 663 665 449 Local timer interrupts Without it: root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 188778 3142 3793 3993 Local timer interrupts LOC: 198993 5314 6323 6438 Local timer interrupts While scx_example_central itself is too barebone to be useful as a production scheduler, a more featureful central scheduler can be built using the same approach. Google's experience shows that such an approach can have significant benefits for certain applications such as VM hosting. v2: * Convert to BPF inline iterators. 
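
As an aside (not part of this patch), from the BPF side the tickless mode amounts to dispatching with an infinite slice and then expiring tasks explicitly. A minimal sketch, assuming scx_common.bpf.h as in the example schedulers; the function name is illustrative:

	void BPF_STRUCT_OPS(tickless_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/*
		 * SCX_SLICE_INF means the core never expires the task on its
		 * own, which is what allows a CONFIG_NO_HZ_FULL CPU running it
		 * to stop its tick.
		 */
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_INF, enq_flags);
	}

Something else then has to force switches - e.g. a periodic BPF timer kicking CPUs with SCX_KICK_PREEMPT, as scx_example_central does below.
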
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 1 + kernel/sched/core.c | 9 +- kernel/sched/ext.c | 43 +++++++- kernel/sched/ext.h | 2 + kernel/sched/sched.h | 6 ++ tools/sched_ext/scx_example_central.bpf.c | 121 ++++++++++++++++++++-- tools/sched_ext/scx_example_central.c | 3 +- 7 files changed, 173 insertions(+), 12 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 16eb54635e350..2f2ee3e05904a 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -19,6 +19,7 @@ enum scx_consts { SCX_EXIT_MSG_LEN = 1024, SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc90327f950df..a7e0725c24697 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1202,13 +1202,16 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ if (!scx_switched_all() && rq->nr_running > 1) return false; + if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + return true; } #endif /* CONFIG_NO_HZ_FULL */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dda9dbba896d9..8b808952a6cbf 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -478,7 +478,8 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (curr->scx.slice != SCX_SLICE_INF) + curr->scx.slice -= min(curr->scx.slice, delta_exec); } static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, @@ -1400,6 +1401,20 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) SCX_CALL_OP(SCX_KF_REST, running, p); watchdog_unwatch_task(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See can_stop_tick_scx(). + */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + } } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -1940,6 +1955,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy) return 0; } +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_disabling()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). 
+ */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + /* * Omitted operations: * @@ -2100,7 +2135,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; const char *reason; - int i, type; + int i, cpu, type; type = atomic_read(&scx_exit_type); while (true) { @@ -2197,6 +2232,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + /* kick all CPUs to restore ticks */ + for_each_possible_cpu(cpu) + resched_cpu(cpu); + forward_progress_guaranteed: /* * Here, every runnable task is guaranteed to make forward progress and diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 0b04626e8ca20..9c9284f91e388 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -82,6 +82,7 @@ int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); int scx_check_setscheduler(struct task_struct *p, int policy); +bool scx_can_stop_tick(struct rq *rq); void init_sched_ext_class(void); __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, @@ -141,6 +142,7 @@ static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline void init_sched_ext_class(void) {} static inline void scx_notify_sched_tick(void) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cbdfc7b612258..e6dacf488a20a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -686,12 +686,18 @@ struct cfs_rq { }; #ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_CAN_STOP_TICK = 1 << 0, +}; + struct scx_rq { struct scx_dispatch_q local_dsq; struct list_head watchdog_list; u64 ops_qseq; u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; + u32 flags; cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; struct irq_work kick_cpus_irq_work; diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c index 443504fa68f84..4cec04b4c2ede 100644 --- a/tools/sched_ext/scx_example_central.bpf.c +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -13,7 +13,26 @@ * through per-CPU BPF queues. The current design is chosen to maximally * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. * - * b. Preemption + * b. Tickless operation + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. Unfortunately, BPF timer currently doesn't + * have a way to pin to a specific CPU, so the periodic timer isn't pinned to + * the central CPU. + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. 
the periodic timer may run on a ksoftirqd and if the + * ksoftirqd gets starved by a user thread, there may not be anything else to + * vacate that user thread. * * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the * next tasks. @@ -42,7 +61,7 @@ const volatile s32 central_cpu; const volatile u32 nr_cpu_ids = 64; /* !0 for veristat, set during init */ u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -u64 nr_dispatches, nr_mismatches, nr_retries; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; u64 nr_overflows; struct user_exit_info uei; @@ -55,6 +74,7 @@ struct { /* can't use percpu map due to bad lookups */ static bool cpu_gimme_task[MAX_CPUS]; +static u64 cpu_started_at[MAX_CPUS]; struct central_timer { struct bpf_timer timer; @@ -67,6 +87,11 @@ struct { __type(value, struct central_timer); } central_timer SEC(".maps"); +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -85,9 +110,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) __sync_fetch_and_add(&nr_total, 1); + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads which is necessary for forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd. + */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + if (bpf_map_push_elem(¢ral_q, &pid, 0)) { __sync_fetch_and_add(&nr_overflows, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); return; } @@ -120,13 +158,13 @@ static bool dispatch_to_cpu(s32 cpu) */ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { __sync_fetch_and_add(&nr_mismatches, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); continue; } /* dispatch to local and mark that @cpu doesn't need more */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); if (cpu != central_cpu) scx_bpf_kick_cpu(cpu, 0); @@ -194,12 +232,81 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at) + *started_at = 0; +} + +static int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 now = bpf_ktime_get_ns(); + u64 nr_to_kick = nr_queued; + s32 i; + + bpf_for(i, 0, nr_cpu_ids) { + s32 cpu = (nr_timers + i) % nr_cpu_ids; + u64 *started_at; + + if (cpu == central_cpu) + continue; + + /* kick iff the current one exhausted its slice */ + started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at && *started_at && + vtime_before(now, *started_at + SCX_SLICE_DFL)) + continue; + + /* and there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + 
scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (nr_to_kick) + nr_to_kick--; + else + continue; + + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + } + + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + + bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + int BPF_STRUCT_OPS_SLEEPABLE(central_init) { + u32 key = 0; + struct bpf_timer *timer; + int ret; + if (!switch_partial) scx_bpf_switch_all(); - return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(¢ral_timer, &key); + if (!timer) + return -ESRCH; + + bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + return ret; } void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) @@ -219,6 +326,8 @@ struct sched_ext_ops central_ops = { .select_cpu = (void *)central_select_cpu, .enqueue = (void *)central_enqueue, .dispatch = (void *)central_dispatch, + .running = (void *)central_running, + .stopping = (void *)central_stopping, .init = (void *)central_init, .exit = (void *)central_exit, .name = "central", diff --git a/tools/sched_ext/scx_example_central.c b/tools/sched_ext/scx_example_central.c index 959b305a93a98..7ad591cbdc65c 100644 --- a/tools/sched_ext/scx_example_central.c +++ b/tools/sched_ext/scx_example_central.c @@ -76,7 +76,8 @@ int main(int argc, char **argv) skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf(" dispatch:%10lu mismatch:%10lu retry:%10lu\n", + printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n", + skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_retries); From 9ab6c453d27cb38931d082dd1c3b853fb53ca87e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 023/304] sched_ext: Track tasks that are subjects of the in-flight SCX operation When some SCX operations are in flight, it is known that the subject task's rq lock is held throughout which makes it safe to access certain fields of the task - e.g. its current task_group. We want to add SCX kfunc helpers that can make use of this guarantee - e.g. to help determining the currently associated CPU cgroup from the task's current task_group. As it'd be dangerous call such a helper on a task which isn't rq lock protected, the helper should be able to verify the input task and reject accordingly. This patch adds sched_ext_entity.kf_tasks[] that track the tasks which are currently being operated on by a terminal SCX operation. The new SCX_CALL_OP_[2]TASK[_RET]() can be used when invoking SCX operations which take tasks as arguments and the scx_kf_allowed_on_arg_tasks() can be used by kfunc helpers to verify the input task status. Note that as sched_ext_entity.kf_tasks[] can't handle nesting, the tracking is currently only limited to terminal SCX operations. If needed in the future, this restriction can be removed by moving the tracking to the task side with a couple per-task counters. 
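To illustrate the intended usage (a sketch, not part of this patch): a kfunc
which must only run on the task currently being operated on would gate itself
on the new scx_kf_allowed_on_arg_tasks() before touching rq-protected state.
The helper name below is made up for illustration; the gating call and
__SCX_KF_RQ_LOCKED are the ones added in this patch:

  /* hypothetical kfunc; only valid on the in-flight operation's subject task */
  static u64 scx_bpf_example_task_weight(struct task_struct *p)
  {
  	if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
  		return 0;

  	/* @p's rq is locked by the in-flight operation, safe to read */
  	return p->scx.weight;
  }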
Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/linux/sched/ext.h | 2 + kernel/sched/ext.c | 91 +++++++++++++++++++++++++++++++-------- 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 2f2ee3e05904a..1ed07b4bdb245 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -449,6 +449,7 @@ enum scx_kf_mask { SCX_KF_REST = 1 << 5, /* other rq-locked operations */ __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_REST, }; /* @@ -464,6 +465,7 @@ struct sched_ext_entity { s32 sticky_cpu; s32 holding_cpu; u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic64_t ops_state; unsigned long runnable_at; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8b808952a6cbf..47906b05626bb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -239,6 +239,47 @@ do { \ __ret; \ }) +/* + * Some kfuncs are allowed only on the tasks that are subjects of the + * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such + * restrictions, the following SCX_CALL_OP_*() variants should be used when + * invoking scx_ops operations that take task arguments. These can only be used + * for non-nesting operations due to the way the tasks are tracked. + * + * kfuncs which can only operate on such tasks can in turn use + * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on + * the specific task. + */ +#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +do { \ + BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + SCX_CALL_OP(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ +} while (0) + +#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +({ \ + __typeof__(scx_ops.op(task, ##args)) __ret; \ + BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + __ret; \ +}) + +#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) 
\ +({ \ + __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task0; \ + current->scx.kf_tasks[1] = task1; \ + __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + current->scx.kf_tasks[1] = NULL; \ + __ret; \ +}) + /* @mask is constant, always inline to cull unnecessary branches */ static __always_inline bool scx_kf_allowed(u32 mask) { @@ -269,6 +310,22 @@ static __always_inline bool scx_kf_allowed(u32 mask) return true; } +/* see SCX_CALL_OP_TASK() */ +static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, + struct task_struct *p) +{ + if (!scx_kf_allowed(__SCX_KF_RQ_LOCKED)) + return false; + + if (unlikely((p != current->scx.kf_tasks[0] && + p != current->scx.kf_tasks[1]))) { + scx_ops_error("called on a task not being operated on"); + return false; + } + + return true; +} + /** * scx_task_iter_init - Initialize a task iterator * @iter: iterator to init @@ -706,7 +763,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); /* * If not directly dispatched, QUEUEING isn't clear yet and dispatch or @@ -778,7 +835,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags add_nr_running(rq, 1); if (SCX_HAS_OP(runnable)) - SCX_CALL_OP(SCX_KF_REST, runnable, p, enq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); do_enqueue_task(rq, p, enq_flags, sticky_cpu); } @@ -803,7 +860,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) BUG(); case SCX_OPSS_QUEUED: if (SCX_HAS_OP(dequeue)) - SCX_CALL_OP(SCX_KF_REST, dequeue, p, deq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); if (atomic64_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) @@ -854,11 +911,11 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags */ if (SCX_HAS_OP(stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP(SCX_KF_REST, stopping, p, false); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); } if (SCX_HAS_OP(quiescent)) - SCX_CALL_OP(SCX_KF_REST, quiescent, p, deq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -877,7 +934,7 @@ static void yield_task_scx(struct rq *rq) struct task_struct *p = rq->curr; if (SCX_HAS_OP(yield)) - SCX_CALL_OP_RET(SCX_KF_REST, yield, p, NULL); + SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); else p->scx.slice = 0; } @@ -887,7 +944,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) struct task_struct *from = rq->curr; if (SCX_HAS_OP(yield)) - return SCX_CALL_OP_RET(SCX_KF_REST, yield, from, to); + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); else return false; } @@ -1398,7 +1455,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP(SCX_KF_REST, running, p); + SCX_CALL_OP_TASK(SCX_KF_REST, running, p); watchdog_unwatch_task(p, true); @@ -1454,7 +1511,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP(SCX_KF_REST, 
stopping, p, true); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); /* * If we're being called from put_prev_task_balance(), balance_scx() may @@ -1617,8 +1674,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (SCX_HAS_OP(select_cpu)) { s32 cpu; - cpu = SCX_CALL_OP_RET(SCX_KF_REST, select_cpu, p, prev_cpu, - wake_flags); + cpu = SCX_CALL_OP_TASK_RET(SCX_KF_REST, select_cpu, p, prev_cpu, + wake_flags); if (ops_cpu_valid(cpu)) { return cpu; } else { @@ -1644,8 +1701,8 @@ static void set_cpus_allowed_scx(struct task_struct *p, * designation pointless. Cast it away when calling the operation. */ if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); } static void reset_idle_masks(void) @@ -1806,7 +1863,7 @@ static void scx_ops_enable_task(struct task_struct *p) if (SCX_HAS_OP(enable)) { struct scx_enable_args args = { }; - SCX_CALL_OP(SCX_KF_REST, enable, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, enable, p, &args); } p->scx.flags &= ~SCX_TASK_OPS_PREPPED; p->scx.flags |= SCX_TASK_OPS_ENABLED; @@ -1845,7 +1902,7 @@ static void refresh_scx_weight(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); } void scx_pre_fork(struct task_struct *p) @@ -1936,8 +1993,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) * different scheduler class. Keep the BPF scheduler up-to-date. */ if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); } static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} From 2441eb1484a79afa1bacaa0f4a16cbb185be735b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 024/304] sched_ext: Add cgroup support Add sched_ext_ops operations to init/exit cgroups, and track task migrations and config changes. Because different BPF schedulers may implement different subsets of CPU control features, allow BPF schedulers to pick which cgroup interface files to enable using SCX_OPS_CGROUP_KNOB_* flags. For now, only the weight knobs are supported but adding more should be straightforward. While a BPF scheduler is being enabled and disabled, relevant cgroup operations are locked out using scx_cgroup_rwsem. This avoids situations like task prep taking place while the task is being moved across cgroups, making things easier for BPF schedulers. This patch also adds scx_example_pair which implements a variant of core scheduling where a hyperthread pair only run tasks from the same cgroup. The BPF scheduler achieves this by putting tasks into per-cgroup queues, time-slicing the cgroup to run for each pair first, and then scheduling within the cgroup. See the header comment in scx_example_pair.bpf.c for more details. Note that scx_example_pair's cgroup-boundary guarantee breaks down for tasks running in higher priority scheduler classes. This will be addressed by a followup patch which implements a mechanism to track CPU preemption. v3: * Make scx_example_pair switch all tasks by default. * Convert to BPF inline iterators. * scx_bpf_task_cgroup() is added to determine the current cgroup from CPU controller's POV. 
This allows BPF schedulers to accurately track CPU cgroup membership. * scx_exampl_flatcg added. This demonstrates flattened hierarchy implementation of CPU cgroup control and shows significant performance improvement when cgroups which are nested multiple levels are under competition. v2: * Build fixes for different CONFIG combinations. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Reported-by: kernel test robot --- include/linux/sched/ext.h | 100 ++- init/Kconfig | 5 + kernel/sched/core.c | 70 +- kernel/sched/ext.c | 391 ++++++++++- kernel/sched/ext.h | 25 + kernel/sched/sched.h | 12 +- tools/sched_ext/.gitignore | 2 + tools/sched_ext/Makefile | 14 +- tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_flatcg.bpf.c | 824 +++++++++++++++++++++++ tools/sched_ext/scx_example_flatcg.c | 228 +++++++ tools/sched_ext/scx_example_flatcg.h | 49 ++ tools/sched_ext/scx_example_pair.bpf.c | 536 +++++++++++++++ tools/sched_ext/scx_example_pair.c | 143 ++++ tools/sched_ext/scx_example_pair.h | 10 + 15 files changed, 2386 insertions(+), 24 deletions(-) create mode 100644 tools/sched_ext/scx_example_flatcg.bpf.c create mode 100644 tools/sched_ext/scx_example_flatcg.c create mode 100644 tools/sched_ext/scx_example_flatcg.h create mode 100644 tools/sched_ext/scx_example_pair.bpf.c create mode 100644 tools/sched_ext/scx_example_pair.c create mode 100644 tools/sched_ext/scx_example_pair.h diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ed07b4bdb245..9e47e320369d7 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -12,6 +12,8 @@ #include #include +struct cgroup; + enum scx_consts { SCX_OPS_NAME_LEN = 128, SCX_EXIT_REASON_LEN = 128, @@ -108,14 +110,29 @@ enum scx_ops_flags { */ SCX_OPS_ENQ_EXITING = 1LLU << 2, + /* + * CPU cgroup knob enable flags + */ + SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING, + SCX_OPS_ENQ_EXITING | + SCX_OPS_CGROUP_KNOB_WEIGHT, }; /* argument container for ops.enable() and friends */ struct scx_enable_args { - /* empty for now */ +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; }; /** @@ -333,7 +350,8 @@ struct sched_ext_ops { * @p: task to enable BPF scheduling for * @args: enable arguments, see the struct definition * - * Enable @p for BPF scheduling. @p will start running soon. + * Enable @p for BPF scheduling. @p is now in the cgroup specified for + * the preceding prep_enable() and will start running soon. */ void (*enable)(struct task_struct *p, struct scx_enable_args *args); @@ -357,6 +375,79 @@ struct sched_ext_ops { */ void (*disable)(struct task_struct *p); +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * cgroup_init - Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. 
+ */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * cgroup_exit - Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_move - Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_cancel_move - Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_set_weight - A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @tg's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +#endif /* CONFIG_CGROUPS */ + /* * All online ops must come before ops.init(). */ @@ -497,6 +588,9 @@ struct sched_ext_entity { /* cold fields */ struct list_head tasks_node; +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif }; void sched_ext_free(struct task_struct *p); diff --git a/init/Kconfig b/init/Kconfig index 1fb5f313d18f0..375e9c6557b63 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1039,6 +1039,11 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. 
+config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + default y + endif #CGROUP_SCHED config SCHED_MM_CID diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a7e0725c24697..0780414f3c156 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9859,6 +9859,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -10315,6 +10318,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -10418,6 +10422,7 @@ void sched_move_task(struct task_struct *tsk) SCHED_CHANGE_BLOCK(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) { sched_change_group(tsk); + scx_move_task(tsk); } /* @@ -10454,6 +10459,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -10470,6 +10480,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -10487,9 +10504,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -10497,7 +10515,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; +#endif + return scx_cgroup_can_attach(tset); } #endif @@ -10508,8 +10527,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + scx_cgroup_finish_attach(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); +} +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css) { @@ -10691,9 +10719,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, @@ -11157,11 +11191,15 @@ static int cpu_extra_stat_show(struct seq_file *sf, return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined(CONFIG_FAIR_GROUP_SCHED) || 
defined(CONFIG_EXT_GROUP_SCHED) static unsigned long tg_weight(struct task_group *tg) { +#ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif } static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, @@ -11174,13 +11212,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 cgrp_weight) { unsigned long weight; + int ret; if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, @@ -11205,7 +11247,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -11214,7 +11256,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } #endif @@ -11276,7 +11322,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, #endif struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) [CPU_CFTYPE_WEIGHT] = { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, @@ -11289,6 +11335,8 @@ struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED [CPU_CFTYPE_IDLE] = { .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, @@ -11330,13 +11378,17 @@ struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, +#ifdef CONFIG_EXT_GROUP_SCHED + .cancel_attach = cpu_cgroup_cancel_attach, +#endif .legacy_cftypes = cpu_legacy_cftypes, .dfl_cftypes = cpu_cftypes, .early_init = true, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 47906b05626bb..f1c9616acff46 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1813,6 +1813,28 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) resched_curr(rq); } +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. 
+ */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) { int ret; @@ -1822,7 +1844,9 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) p->scx.disallow = false; if (SCX_HAS_OP(prep_enable)) { - struct scx_enable_args args = { }; + struct scx_enable_args args = { + SCX_ENABLE_ARGS_INIT_CGROUP(tg) + }; ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, prep_enable, p, &args); if (unlikely(ret)) { @@ -1862,7 +1886,9 @@ static void scx_ops_enable_task(struct task_struct *p) WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); if (SCX_HAS_OP(enable)) { - struct scx_enable_args args = { }; + struct scx_enable_args args = { + SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) + }; SCX_CALL_OP_TASK(SCX_KF_REST, enable, p, &args); } p->scx.flags &= ~SCX_TASK_OPS_PREPPED; @@ -1875,7 +1901,9 @@ static void scx_ops_disable_task(struct task_struct *p) if (p->scx.flags & SCX_TASK_OPS_PREPPED) { if (SCX_HAS_OP(cancel_enable)) { - struct scx_enable_args args = { }; + struct scx_enable_args args = { + SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) + }; SCX_CALL_OP(SCX_KF_REST, cancel_enable, p, &args); } p->scx.flags &= ~SCX_TASK_OPS_PREPPED; @@ -2032,6 +2060,166 @@ bool scx_can_stop_tick(struct rq *rq) } #endif +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, + tg->css.cgroup, &args); + if (!ret) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + else + ret = ops_sanitize_err("cgroup_init", ret); + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} + +void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_enabled()) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + + if (SCX_HAS_OP(cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move, + p, from, css->cgroup); + if (ret) + goto err; + } + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (!p->scx.cgrp_moving_from) + break; + if (SCX_HAS_OP(cgroup_cancel_move)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ops_sanitize_err("cgroup_prep_move", ret); +} + +void scx_move_task(struct task_struct *p) +{ + 
/* + * We're called from sched_move_task() which handles both cgroup and + * autogroup moves. Ignore the latter. + */ + if (task_group_is_autogroup(task_group(p))) + return; + + if (!scx_enabled()) + return; + + if (SCX_HAS_OP(cgroup_move)) { + WARN_ON_ONCE(!p->scx.cgrp_moving_from); + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + } + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_enabled()) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move)) { + WARN_ON_ONCE(!p->scx.cgrp_moving_from); + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + } + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + percpu_down_read(&scx_cgroup_rwsem); + + if (tg->scx_weight != weight) { + if (SCX_HAS_OP(cgroup_set_weight)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight, + tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + /* * Omitted operations: * @@ -2161,6 +2349,131 @@ static void destroy_dsq(u64 dsq_id) rcu_read_unlock(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. + */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!scx_ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(void) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * cgroups and init, all online cgroups are initialized. 
+ */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!scx_ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, + css->cgroup, &args); + if (ret) { + css_put(css); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + return 0; +} + +static void scx_cgroup_config_knobs(void) +{ + static DEFINE_MUTEX(cgintf_mutex); + DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; + u64 knob_flags; + int i; + + /* + * Called from both class switch and ops enable/disable paths, + * synchronize internally. + */ + mutex_lock(&cgintf_mutex); + + /* if fair is in use, all knobs should be shown */ + if (!scx_switched_all()) { + bitmap_fill(mask, CPU_CFTYPE_CNT); + goto apply; + } + + /* + * On ext, only show the supported knobs. Otherwise, show all possible + * knobs so that configuration attempts succeed and the states are + * remembered while ops is not loaded. + */ + if (scx_enabled()) + knob_flags = scx_ops.flags; + else + knob_flags = SCX_OPS_ALL_FLAGS; + + if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { + __set_bit(CPU_CFTYPE_WEIGHT, mask); + __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); + } +apply: + for (i = 0; i < CPU_CFTYPE_CNT; i++) + cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); + + mutex_unlock(&cgintf_mutex); +} + +#else +static void scx_cgroup_exit(void) {} +static int scx_cgroup_init(void) { return 0; } +static void scx_cgroup_config_knobs(void) {} +#endif + /* * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. @@ -2304,9 +2617,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); - /* avoid racing against fork */ + /* avoid racing against fork and cgroup changes */ cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); @@ -2343,6 +2657,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); + scx_cgroup_exit(); + + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); cpus_read_unlock(); @@ -2381,6 +2698,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != SCX_OPS_DISABLING); + + scx_cgroup_config_knobs(); } static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); @@ -2526,10 +2845,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops) scx_watchdog_timeout / 2); /* - * Lock out forks before opening the floodgate so that they don't wander - * into the operations prematurely. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. */ percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); for (i = 0; i < SCX_NR_ONLINE_OPS; i++) if (((void (**)(void))ops)[i]) @@ -2548,6 +2868,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); } + /* + * All cgroups should be initialized before letting in tasks. 
cgroup + * on/offlining and task migrations are already locked out. + */ + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock; + static_branch_enable_cpuslocked(&__scx_ops_enabled); /* @@ -2630,6 +2958,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); preempt_enable(); + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { @@ -2643,6 +2972,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) cpus_read_unlock(); mutex_unlock(&scx_ops_enable_mutex); + scx_cgroup_config_knobs(); + return 0; err_unlock: @@ -2650,6 +2981,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) return ret; err_disable_unlock: + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable: cpus_read_unlock(); @@ -2807,6 +3139,11 @@ static int bpf_scx_check_member(const struct btf_type *t, switch (moff) { case offsetof(struct sched_ext_ops, prep_enable): +#ifdef CONFIG_EXT_GROUP_SCHED + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): +#endif case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): break; @@ -2905,7 +3242,8 @@ void __init init_sched_ext_class(void) * definitions so that BPF scheduler implementations can use them * through the generated vmlinux.h. */ - WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP); + WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | + SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); @@ -2926,6 +3264,7 @@ void __init init_sched_ext_class(void) register_sysrq_key('S', &sysrq_sched_ext_reset_op); INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); + scx_cgroup_config_knobs(); } @@ -2969,8 +3308,8 @@ static const struct btf_kfunc_id_set scx_kfunc_set_init = { * @dsq_id: DSQ to create * @node: NUMA node to allocate from * - * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and - * ops.prep_enable(). + * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), + * ops.prep_enable(), ops.cgroup_init() and ops.cgroup_prep_move(). */ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) { @@ -3413,6 +3752,39 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) return task_cpu(p); } +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all + * rq-locked operations. Can be called on the parameter tasks of rq-locked + * operations. The restriction guarantees that @p's rq is locked by the caller. + */ +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + goto out; + + /* + * A task_group may either be a cgroup or an autogroup. In the latter + * case, @tg->css.cgroup is %NULL. A task_group can't become the other + * kind once created. 
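+	 * For autogroups, fall back to the default root cgroup, matching
+	 * tg_cgrp().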
+ */ + if (tg && tg->css.cgroup) + cgrp = tg->css.cgroup; + else + cgrp = &cgrp_dfl_root.cgrp; +out: + cgroup_get(cgrp); + return cgrp; +} + BTF_SET8_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) @@ -3425,6 +3797,7 @@ BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) BTF_SET8_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 9c9284f91e388..0c5a109e7e6d1 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -59,6 +59,11 @@ enum scx_deq_flags { SCX_DEQ_SLEEP = DEQUEUE_SLEEP, }; +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + enum scx_kick_flags { SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ }; @@ -162,3 +167,23 @@ static inline void scx_update_idle(struct rq *rq, bool idle) #else static inline void scx_update_idle(struct rq *rq, bool idle) {} #endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e6dacf488a20a..3571cfda62a30 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -424,6 +424,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -528,6 +533,11 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + return 0; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -3418,7 +3428,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n #ifdef CONFIG_CGROUP_SCHED enum cpu_cftype_id { -#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) CPU_CFTYPE_WEIGHT, CPU_CFTYPE_WEIGHT_NICE, CPU_CFTYPE_IDLE, diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index 3d8ec46ca304f..769bc6f35cc64 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -1,6 +1,8 @@ scx_example_simple scx_example_qmap scx_example_central 
+scx_example_pair +scx_example_flatcg *.skel.h *.subskel.h /tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index bcec7c1fb7b19..8c7543bbff8d7 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -115,7 +115,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wall -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_example_simple scx_example_qmap scx_example_central +all: scx_example_simple scx_example_qmap scx_example_central scx_example_pair \ + scx_example_flatcg # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -178,10 +179,19 @@ scx_example_central: scx_example_central.c scx_example_central.skel.h user_exit_ $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +scx_example_pair: scx_example_pair.c scx_example_pair.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + +scx_example_flatcg: scx_example_flatcg.c scx_example_flatcg.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + clean: rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h - rm -f scx_example_simple scx_example_qmap scx_example_central + rm -f scx_example_simple scx_example_qmap scx_example_central \ + scx_example_pair scx_example_flatcg .PHONY: all clean diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 647ec7a99bd01..b81e5a89e9221 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -67,6 +67,7 @@ void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_example_flatcg.bpf.c new file mode 100644 index 0000000000000..9632bab7f164c --- /dev/null +++ b/tools/sched_ext/scx_example_flatcg.bpf.c @@ -0,0 +1,824 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext flattened cgroup hierarchy scheduler. It implements + * hierarchical weight-based cgroup CPU control by flattening the cgroup + * hierarchy into a single layer by compounding the active weight share at each + * level. Consider the following hierarchy with weights in parentheses: + * + * R + A (100) + B (100) + * | \ C (100) + * \ D (200) + * + * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. + * Let's say all three have runnable tasks. The total share that each of these + * three cgroups is entitled to can be calculated by compounding its share at + * each level. + * + * For example, B is competing against C and in that competition its share is + * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's + * share in that competition is 200/(200+100) == 1/3. B's eventual share in the + * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's + * eventual shaer is the same at 1/6. D is only competing at the top level and + * its share is 200/(100+200) == 2/3. 
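+ * As a sanity check, the three eventual shares sum to one:
+ * 1/6 + 1/6 + 2/3 == 1.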
+ * + * So, instead of hierarchically scheduling level-by-level, we can consider it + * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 + * and keep updating the eventual shares as the cgroups' runnable states change. + * + * This flattening of hierarchy can bring a substantial performance gain when + * the cgroup hierarchy is nested multiple levels. in a simple benchmark using + * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it + * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two + * apache instances competing with 2:1 weight ratio nested four level deep. + * + * However, the gain comes at the cost of not being able to properly handle + * thundering herd of cgroups. For example, if many cgroups which are nested + * behind a low priority parent cgroup wake up around the same time, they may be + * able to consume more CPU cycles than they are entitled to. In many use cases, + * this isn't a real concern especially given the performance gain. Also, there + * are ways to mitigate the problem further by e.g. introducing an extra + * scheduling layer on cgroup delegation boundaries. + */ +#include "scx_common.bpf.h" +#include "user_exit_info.h" +#include "scx_example_flatcg.h" + +char _license[] SEC("license") = "GPL"; + +const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ +const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; +const volatile bool switch_partial; + +u64 cvtime_now; +struct user_exit_info uei; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, FCG_NR_STATS); +} stats SEC(".maps"); + +static void stat_inc(enum fcg_stat_idx idx) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p)++; +} + +struct fcg_cpu_ctx { + u64 cur_cgid; + u64 cur_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct fcg_cpu_ctx); + __uint(max_entries, 1); +} cpu_ctx SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_cgrp_ctx); +} cgrp_ctx SEC(".maps"); + +struct cgv_node { + struct bpf_rb_node rb_node; + __u64 cvtime; + __u64 cgid; +}; + +private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; +private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); + +struct cgv_node_stash { + struct cgv_node __kptr *node; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 16384); + __type(key, __u64); + __type(value, struct cgv_node_stash); +} cgv_node_stash SEC(".maps"); + +struct fcg_task_ctx { + u64 bypassed_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_task_ctx); +} task_ctx SEC(".maps"); + +/* gets inc'd on weight tree changes to expire the cached hweights */ +unsigned long hweight_gen = 1; + +static u64 div_round_up(u64 dividend, u64 divisor) +{ + return (dividend + divisor - 1) / divisor; +} + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct cgv_node *cgc_a, *cgc_b; + + cgc_a = container_of(a, struct cgv_node, rb_node); + cgc_b = container_of(b, struct cgv_node, rb_node); + + return cgc_a->cvtime < cgc_b->cvtime; +} + +static struct fcg_cpu_ctx *find_cpu_ctx(void) +{ + struct fcg_cpu_ctx *cpuc; 
+ u32 idx = 0; + + cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); + if (!cpuc) { + scx_bpf_error("cpu_ctx lookup failed"); + return NULL; + } + return cpuc; +} + +static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) +{ + struct fcg_cgrp_ctx *cgc; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); + return NULL; + } + return cgc; +} + +static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) +{ + struct fcg_cgrp_ctx *cgc; + + cgrp = bpf_cgroup_ancestor(cgrp, level); + if (!cgrp) { + scx_bpf_error("ancestor cgroup lookup failed"); + return NULL; + } + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + scx_bpf_error("ancestor cgrp_ctx lookup failed"); + bpf_cgroup_release(cgrp); + return cgc; +} + +static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + int level; + + if (!cgc->nr_active) { + stat_inc(FCG_STAT_HWT_SKIP); + return; + } + + if (cgc->hweight_gen == hweight_gen) { + stat_inc(FCG_STAT_HWT_CACHE); + return; + } + + stat_inc(FCG_STAT_HWT_UPDATES); + bpf_for(level, 0, cgrp->level + 1) { + struct fcg_cgrp_ctx *cgc; + bool is_active; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + + if (!level) { + cgc->hweight = FCG_HWEIGHT_ONE; + cgc->hweight_gen = hweight_gen; + } else { + struct fcg_cgrp_ctx *pcgc; + + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + + /* + * We can be oppotunistic here and not grab the + * cgv_tree_lock and deal with the occasional races. + * However, hweight updates are already cached and + * relatively low-frequency. Let's just do the + * straightforward thing. + */ + bpf_spin_lock(&cgv_tree_lock); + is_active = cgc->nr_active; + if (is_active) { + cgc->hweight_gen = pcgc->hweight_gen; + cgc->hweight = + div_round_up(pcgc->hweight * cgc->weight, + pcgc->child_weight_sum); + } + bpf_spin_unlock(&cgv_tree_lock); + + if (!is_active) { + stat_inc(FCG_STAT_HWT_RACE); + break; + } + } + } +} + +static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) +{ + u64 delta, cvtime, max_budget; + + /* + * A node which is on the rbtree can't be pointed to from elsewhere yet + * and thus can't be updated and repositioned. Instead, we collect the + * vtime deltas separately and apply it asynchronously here. + */ + delta = cgc->cvtime_delta; + __sync_fetch_and_sub(&cgc->cvtime_delta, delta); + cvtime = cgv_node->cvtime + delta; + + /* + * Allow a cgroup to carry the maximum budget proportional to its + * hweight such that a full-hweight cgroup can immediately take up half + * of the CPUs at the most while staying at the front of the rbtree. 
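+	 * IOW, max_budget is half a slice across all CPUs, scaled by
+	 * hweight / FCG_HWEIGHT_ONE.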
+ */ + max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / + (2 * FCG_HWEIGHT_ONE); + if (vtime_before(cvtime, cvtime_now - max_budget)) + cvtime = cvtime_now - max_budget; + + cgv_node->cvtime = cvtime; +} + +static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + u64 cgid = cgrp->kn->id; + + /* paired with cmpxchg in try_pick_next_cgroup() */ + if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { + stat_inc(FCG_STAT_ENQ_SKIP); + return; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); + return; + } + + /* NULL if the node is already on the rbtree */ + cgv_node = bpf_kptr_xchg(&stash->node, NULL); + if (!cgv_node) { + stat_inc(FCG_STAT_ENQ_RACE); + return; + } + + bpf_spin_lock(&cgv_tree_lock); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * If select_cpu_dfl() is recommending local enqueue, the target CPU is + * idle. Follow it and charge the cgroup later in fcg_stopping() after + * the fact. Use the same mechanism to deal with tasks with custom + * affinities so that we don't have to worry about per-cgroup dq's + * containing tasks that can't be executed from some CPUs. + */ + if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) { + /* + * Tell fcg_stopping() that this bypassed the regular scheduling + * path and should be force charged to the cgroup. 0 is used to + * indicate that the task isn't bypassing, so if the current + * runtime is 0, go back by one nanosecond. + */ + taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; + + /* + * The global dq is deprioritized as we don't want to let tasks + * to boost themselves by constraining its cpumask. The + * deprioritization is rather severe, so let's not apply that to + * per-cpu kernel threads. This is ham-fisted. We probably wanna + * implement per-cgroup fallback dq's instead so that we have + * more control over when tasks with custom cpumask get issued. + */ + if ((enq_flags & SCX_ENQ_LOCAL) || + (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) { + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + } else { + stat_inc(FCG_STAT_GLOBAL); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + return; + } + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + goto out_release; + + scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + + cgrp_enqueued(cgrp, cgc); +out_release: + bpf_cgroup_release(cgrp); +} + +/* + * Walk the cgroup tree to update the active weight sums as tasks wake up and + * sleep. The weight sums are used as the base when calculating the proportion a + * given cgroup or task is entitled to at each level. + */ +static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) +{ + struct fcg_cgrp_ctx *cgc; + bool updated = false; + int idx; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + /* + * In most cases, a hot cgroup would have multiple threads going to + * sleep and waking up while the whole cgroup stays active. 
In leaf + * cgroups, ->nr_runnable which is updated with __sync operations gates + * ->nr_active updates, so that we don't have to grab the cgv_tree_lock + * repeatedly for a busy cgroup which is staying active. + */ + if (runnable) { + if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_ACT); + } else { + if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_DEACT); + } + + /* + * If @cgrp is becoming runnable, its hweight should be refreshed after + * it's added to the weight tree so that enqueue has the up-to-date + * value. If @cgrp is becoming quiescent, the hweight should be + * refreshed before it's removed from the weight tree so that the usage + * charging which happens afterwards has access to the latest value. + */ + if (!runnable) + cgrp_refresh_hweight(cgrp, cgc); + + /* propagate upwards */ + bpf_for(idx, 0, cgrp->level) { + int level = cgrp->level - idx; + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + bool propagate = false; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + if (level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + } + + /* + * We need the propagation protected by a lock to synchronize + * against weight changes. There's no reason to drop the lock at + * each level but bpf_spin_lock() doesn't want any function + * calls while locked. + */ + bpf_spin_lock(&cgv_tree_lock); + + if (runnable) { + if (!cgc->nr_active++) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum += cgc->weight; + } + } + } else { + if (!--cgc->nr_active) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum -= cgc->weight; + } + } + } + + bpf_spin_unlock(&cgv_tree_lock); + + if (!propagate) + break; + } + + if (updated) + __sync_fetch_and_add(&hweight_gen, 1); + + if (runnable) + cgrp_refresh_hweight(cgrp, cgc); +} + +void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, true); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (!taskc->bypassed_at) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + __sync_fetch_and_add(&cgc->cvtime_delta, + p->se.sum_exec_runtime - taskc->bypassed_at); + taskc->bypassed_at = 0; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, false); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{ + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + if (cgrp->level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); + if (!pcgc) + return; + } + + bpf_spin_lock(&cgv_tree_lock); + if (pcgc && cgc->nr_active) + pcgc->child_weight_sum += (s64)weight - cgc->weight; + cgc->weight = weight; + bpf_spin_unlock(&cgv_tree_lock); +} + +static bool try_pick_next_cgroup(u64 *cgidp) +{ + struct bpf_rb_node *rb_node; + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 cgid; + + /* pop the 
front cgroup and wind cvtime_now accordingly */ + bpf_spin_lock(&cgv_tree_lock); + + rb_node = bpf_rbtree_first(&cgv_tree); + if (!rb_node) { + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_NO_CGRP); + *cgidp = 0; + return true; + } + + rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); + bpf_spin_unlock(&cgv_tree_lock); + + cgv_node = container_of(rb_node, struct cgv_node, rb_node); + cgid = cgv_node->cgid; + + if (vtime_before(cvtime_now, cgv_node->cvtime)) + cvtime_now = cgv_node->cvtime; + + /* + * If lookup fails, the cgroup's gone. Free and move on. See + * fcg_cgroup_exit(). + */ + cgrp = bpf_cgroup_from_id(cgid); + if (!cgrp) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + if (!scx_bpf_consume(cgid)) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_EMPTY); + goto out_stash; + } + + /* + * Successfully consumed from the cgroup. This will be our current + * cgroup for the new slice. Refresh its hweight. + */ + cgrp_refresh_hweight(cgrp, cgc); + + bpf_cgroup_release(cgrp); + + /* + * As the cgroup may have more tasks, add it back to the rbtree. Note + * that here we charge the full slice upfront and then exact later + * according to the actual consumption. This prevents lowpri thundering + * herd from saturating the machine. + */ + bpf_spin_lock(&cgv_tree_lock); + cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + + *cgidp = cgid; + stat_inc(FCG_STAT_PNC_NEXT); + return true; + +out_stash: + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + /* + * Paired with cmpxchg in cgrp_enqueued(). If they see the following + * transition, they'll enqueue the cgroup. If they are earlier, we'll + * see their task in the dq below and requeue the cgroup. + */ + __sync_val_compare_and_swap(&cgc->queued, 1, 0); + + if (scx_bpf_dsq_nr_queued(cgid)) { + bpf_spin_lock(&cgv_tree_lock); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + } else { + cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); + if (cgv_node) { + scx_bpf_error("unexpected !NULL cgv_node stash"); + goto out_free; + } + } + + return false; + +out_free: + bpf_obj_drop(cgv_node); + return false; +} + +void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) +{ + struct fcg_cpu_ctx *cpuc; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 now = bpf_ktime_get_ns(); + + cpuc = find_cpu_ctx(); + if (!cpuc) + return; + + if (!cpuc->cur_cgid) + goto pick_next_cgroup; + + if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { + if (scx_bpf_consume(cpuc->cur_cgid)) { + stat_inc(FCG_STAT_CNS_KEEP); + return; + } + stat_inc(FCG_STAT_CNS_EMPTY); + } else { + stat_inc(FCG_STAT_CNS_EXPIRE); + } + + /* + * The current cgroup is expiring. It was already charged a full slice. + * Calculate the actual usage and accumulate the delta. + */ + cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); + if (!cgrp) { + stat_inc(FCG_STAT_CNS_GONE); + goto pick_next_cgroup; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (cgc) { + /* + * We want to update the vtime delta and then look for the next + * cgroup to execute but the latter needs to be done in a loop + * and we can't keep the lock held. 
Oh well... + */ + bpf_spin_lock(&cgv_tree_lock); + __sync_fetch_and_add(&cgc->cvtime_delta, + (cpuc->cur_at + cgrp_slice_ns - now) * + FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); + bpf_spin_unlock(&cgv_tree_lock); + } else { + stat_inc(FCG_STAT_CNS_GONE); + } + + bpf_cgroup_release(cgrp); + +pick_next_cgroup: + cpuc->cur_at = now; + + if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { + cpuc->cur_cgid = 0; + return; + } + + bpf_repeat(BPF_MAX_LOOPS) { + if (try_pick_next_cgroup(&cpuc->cur_cgid)) + break; + } +} + +s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + struct fcg_task_ctx *taskc; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + taskc = bpf_task_storage_get(&task_ctx, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!taskc) + return -ENOMEM; + + taskc->bypassed_at = 0; + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + struct fcg_cgrp_ctx *cgc; + struct cgv_node *cgv_node; + struct cgv_node_stash empty_stash = {}, *stash; + u64 cgid = cgrp->kn->id; + int ret; + + /* + * Technically incorrect as cgroup ID is full 64bit while dq ID is + * 63bit. Should not be a problem in practice and easy to spot in the + * unlikely case that it breaks. + */ + ret = scx_bpf_create_dsq(cgid, -1); + if (ret) + return ret; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!cgc) { + ret = -ENOMEM; + goto err_destroy_dsq; + } + + cgc->weight = args->weight; + cgc->hweight = FCG_HWEIGHT_ONE; + + ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, + BPF_NOEXIST); + if (ret) { + if (ret != -ENOMEM) + scx_bpf_error("unexpected stash creation error (%d)", + ret); + goto err_destroy_dsq; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("unexpected cgv_node stash lookup failure"); + ret = -ENOENT; + goto err_destroy_dsq; + } + + cgv_node = bpf_obj_new(struct cgv_node); + if (!cgv_node) { + ret = -ENOMEM; + goto err_del_cgv_node; + } + + cgv_node->cgid = cgid; + cgv_node->cvtime = cvtime_now; + + cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); + if (cgv_node) { + scx_bpf_error("unexpected !NULL cgv_node stash"); + ret = -EBUSY; + goto err_drop; + } + + return 0; + +err_drop: + bpf_obj_drop(cgv_node); +err_del_cgv_node: + bpf_map_delete_elem(&cgv_node_stash, &cgid); +err_destroy_dsq: + scx_bpf_destroy_dsq(cgid); + return ret; +} + +void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + + /* + * For now, there's no way find and remove the cgv_node if it's on the + * cgv_tree. Let's drain them in the dispatch path as they get popped + * off the front of the tree. 
+ */ + bpf_map_delete_elem(&cgv_node_stash, &cgid); + scx_bpf_destroy_dsq(cgid); +} + +s32 BPF_STRUCT_OPS(fcg_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops flatcg_ops = { + .enqueue = (void *)fcg_enqueue, + .dispatch = (void *)fcg_dispatch, + .runnable = (void *)fcg_runnable, + .stopping = (void *)fcg_stopping, + .quiescent = (void *)fcg_quiescent, + .prep_enable = (void *)fcg_prep_enable, + .cgroup_set_weight = (void *)fcg_cgroup_set_weight, + .cgroup_init = (void *)fcg_cgroup_init, + .cgroup_exit = (void *)fcg_cgroup_exit, + .init = (void *)fcg_init, + .exit = (void *)fcg_exit, + .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING, + .name = "flatcg", +}; diff --git a/tools/sched_ext/scx_example_flatcg.c b/tools/sched_ext/scx_example_flatcg.c new file mode 100644 index 0000000000000..150f7e16996e6 --- /dev/null +++ b/tools/sched_ext/scx_example_flatcg.c @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_flatcg.h" +#include "scx_example_flatcg.skel.h" + +#ifndef FILEID_KERNFS +#define FILEID_KERNFS 0xfe +#endif + +const char help_fmt[] = +"A flattened cgroup hierarchy sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-i INTERVAL] [-p]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -i INTERVAL Report interval\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) +{ + FILE *fp; + char buf[4096]; + char *line, *cur = NULL, *tok; + __u64 sum = 0, idle = 0; + __u64 delta_sum, delta_idle; + int idx; + + fp = fopen("/proc/stat", "r"); + if (!fp) { + perror("fopen(\"/proc/stat\")"); + return 0.0; + } + + if (!fgets(buf, sizeof(buf), fp)) { + perror("fgets(\"/proc/stat\")"); + fclose(fp); + return 0.0; + } + fclose(fp); + + line = buf; + for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { + char *endp = NULL; + __u64 v; + + if (idx == 0) { + line = NULL; + continue; + } + v = strtoull(tok, &endp, 0); + if (!endp || *endp != '\0') { + fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", + idx, tok); + continue; + } + sum += v; + if (idx == 4) + idle = v; + } + + delta_sum = sum - *last_sum; + delta_idle = idle - *last_idle; + *last_sum = sum; + *last_idle = idle; + + return delta_sum ? 
(float)(delta_sum - delta_idle) / delta_sum : 0.0; +} + +static void fcg_read_stats(struct scx_example_flatcg *skel, __u64 *stats) +{ + __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); + + for (idx = 0; idx < FCG_NR_STATS; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_example_flatcg *skel; + struct bpf_link *link; + struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; + bool dump_cgrps = false; + __u64 last_cpu_sum = 0, last_cpu_idle = 0; + __u64 last_stats[FCG_NR_STATS] = {}; + unsigned long seq = 0; + s32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_flatcg__open(); + if (!skel) { + fprintf(stderr, "Failed to open: %s\n", strerror(errno)); + return 1; + } + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) { + double v; + + switch (opt) { + case 's': + v = strtod(optarg, NULL); + skel->rodata->cgrp_slice_ns = v * 1000; + break; + case 'i': + v = strtod(optarg, NULL); + intv_ts.tv_sec = v; + intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; + break; + case 'd': + dump_cgrps = true; + break; + case 'p': + skel->rodata->switch_partial = true; + break; + case 'h': + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", + (double)skel->rodata->cgrp_slice_ns / 1000000.0, + (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, + dump_cgrps); + + if (scx_example_flatcg__load(skel)) { + fprintf(stderr, "Failed to load: %s\n", strerror(errno)); + return 1; + } + + link = bpf_map__attach_struct_ops(skel->maps.flatcg_ops); + if (!link) { + fprintf(stderr, "Failed to attach_struct_ops: %s\n", + strerror(errno)); + return 1; + } + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + __u64 acc_stats[FCG_NR_STATS]; + __u64 stats[FCG_NR_STATS]; + float cpu_util; + int i; + + cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); + + fcg_read_stats(skel, acc_stats); + for (i = 0; i < FCG_NR_STATS; i++) + stats[i] = acc_stats[i] - last_stats[i]; + + memcpy(last_stats, acc_stats, sizeof(acc_stats)); + + printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n", + seq++, cpu_util * 100.0, skel->data->hweight_gen); + printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n", + stats[FCG_STAT_ACT], + stats[FCG_STAT_DEACT], + stats[FCG_STAT_LOCAL], + stats[FCG_STAT_GLOBAL]); + printf("HWT skip:%6llu race:%6llu cache:%6llu update:%6llu\n", + stats[FCG_STAT_HWT_SKIP], + stats[FCG_STAT_HWT_RACE], + stats[FCG_STAT_HWT_CACHE], + stats[FCG_STAT_HWT_UPDATES]); + printf("ENQ skip:%6llu race:%6llu\n", + stats[FCG_STAT_ENQ_SKIP], + stats[FCG_STAT_ENQ_RACE]); + printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", + stats[FCG_STAT_CNS_KEEP], + stats[FCG_STAT_CNS_EXPIRE], + stats[FCG_STAT_CNS_EMPTY], + stats[FCG_STAT_CNS_GONE]); + printf("PNC nocgrp:%6llu next:%6llu empty:%6llu gone:%6llu\n", + stats[FCG_STAT_PNC_NO_CGRP], + stats[FCG_STAT_PNC_NEXT], + stats[FCG_STAT_PNC_EMPTY], + stats[FCG_STAT_PNC_GONE]); + printf("BAD remove:%6llu\n", + acc_stats[FCG_STAT_BAD_REMOVAL]); + + nanosleep(&intv_ts, NULL); + } + + bpf_link__destroy(link); + 
uei_print(&skel->bss->uei); + scx_example_flatcg__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_example_flatcg.h b/tools/sched_ext/scx_example_flatcg.h new file mode 100644 index 0000000000000..490758ed41f0f --- /dev/null +++ b/tools/sched_ext/scx_example_flatcg.h @@ -0,0 +1,49 @@ +#ifndef __SCX_EXAMPLE_FLATCG_H +#define __SCX_EXAMPLE_FLATCG_H + +enum { + FCG_HWEIGHT_ONE = 1LLU << 16, +}; + +enum fcg_stat_idx { + FCG_STAT_ACT, + FCG_STAT_DEACT, + FCG_STAT_LOCAL, + FCG_STAT_GLOBAL, + + FCG_STAT_HWT_UPDATES, + FCG_STAT_HWT_CACHE, + FCG_STAT_HWT_SKIP, + FCG_STAT_HWT_RACE, + + FCG_STAT_ENQ_SKIP, + FCG_STAT_ENQ_RACE, + + FCG_STAT_CNS_KEEP, + FCG_STAT_CNS_EXPIRE, + FCG_STAT_CNS_EMPTY, + FCG_STAT_CNS_GONE, + + FCG_STAT_PNC_NO_CGRP, + FCG_STAT_PNC_NEXT, + FCG_STAT_PNC_EMPTY, + FCG_STAT_PNC_GONE, + + FCG_STAT_BAD_REMOVAL, + + FCG_NR_STATS, +}; + +struct fcg_cgrp_ctx { + u32 nr_active; + u32 nr_runnable; + u32 queued; + u32 weight; + u32 hweight; + u64 child_weight_sum; + u64 hweight_gen; + s64 cvtime_delta; + u64 tvtime_now; +}; + +#endif /* __SCX_EXAMPLE_FLATCG_H */ diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_example_pair.bpf.c new file mode 100644 index 0000000000000..e5ff39083181f --- /dev/null +++ b/tools/sched_ext/scx_example_pair.bpf.c @@ -0,0 +1,536 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext core-scheduler which always makes every sibling CPU pair + * execute from the same CPU cgroup. + * + * This scheduler is a minimal implementation and would need some form of + * priority handling both inside each cgroup and across the cgroups to be + * practically useful. + * + * Each CPU in the system is paired with exactly one other CPU, according to a + * "stride" value that can be specified when the BPF scheduler program is first + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee + * that they will only ever schedule tasks that belong to the same CPU cgroup. + * + * Scheduler Initialization + * ------------------------ + * + * The scheduler BPF program is first initialized from user space, before it is + * enabled. During this initialization process, each CPU on the system is + * assigned several values that are constant throughout its runtime: + * + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling + * decisions. Paired CPUs always schedule tasks from the same + * CPU cgroup, and synchronize with each other to guarantee + * that this constraint is not violated. + * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access + * a struct pair_ctx object that is shared between the pair. + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the + * pair. Each struct pair_ctx has an active_mask field, + * which is a bitmap used to indicate whether each core + * in the pair currently has an actively running task. + * This index specifies which entry in the bitmap corresponds + * to each CPU in the pair. + * + * During this initialization, the CPUs are paired according to a "stride" that + * may be specified when invoking the user space program that initializes and + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. + * + * Tasks and cgroups + * ----------------- + * + * Every cgroup in the system is registered with the scheduler using the + * pair_cgroup_init() callback, and every task in the system is associated with + * exactly one cgroup. 
At a high level, the idea with the pair scheduler is to + * always schedule tasks from the same cgroup within a given CPU pair. When a + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its + * cgroup ID is read from its task struct, and then a corresponding queue map + * is used to FIFO-enqueue the task for that cgroup. + * + * If you look through the implementation of the scheduler, you'll notice that + * there is quite a bit of complexity involved with looking up the per-cgroup + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's + * allocated in the BPF program. This is done because we use separate maps to + * store the FIFO queue of tasks, and the length of that map, per cgroup. This + * complexity is only present because of current deficiencies in BPF that will + * soon be addressed. The main point to keep in mind is that newly enqueued + * tasks are added to their cgroup's FIFO queue. + * + * Dispatching tasks + * ----------------- + * + * This section will describe how enqueued tasks are dispatched and scheduled. + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is + * as follows: + * + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is + * the structure that's used to synchronize amongst the two pair CPUs in their + * scheduling decisions. After any of the following events have occurred: + * + * - The cgroup's slice run has expired, or + * - The cgroup becomes empty, or + * - Either CPU in the pair is preempted by a higher priority scheduling class + * + * The cgroup transitions to the draining state and stops executing new tasks + * from the cgroup. + * + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and + * wait for the pair CPU to be preempted. + * + * 3. Otherwise, if the pair CPU is not running a task, we can move onto + * scheduling new tasks. Pop the next cgroup id from the top_q queue. + * + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. + * + * Note again that this scheduling behavior is simple, but the implementation + * is complex mostly because this it hits several BPF shortcomings and has to + * work around in often awkward ways. Most of the shortcomings are expected to + * be resolved in the near future which should allow greatly simplifying this + * scheduler. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" +#include "scx_example_pair.h" + +char _license[] SEC("license") = "GPL"; + +const volatile bool switch_partial; + +/* !0 for veristat, set during init */ +const volatile u32 nr_cpu_ids = 64; + +/* a pair of CPUs stay on a cgroup for this duration */ +const volatile u32 pair_batch_dur_ns = SCX_SLICE_DFL; + +/* cpu ID -> pair cpu ID */ +const volatile s32 pair_cpu[MAX_CPUS] = { [0 ... 
MAX_CPUS - 1] = -1 }; + +/* cpu ID -> pair_id */ +const volatile u32 pair_id[MAX_CPUS]; + +/* CPU ID -> CPU # in the pair (0 or 1) */ +const volatile u32 in_pair_idx[MAX_CPUS]; + +struct pair_ctx { + struct bpf_spin_lock lock; + + /* the cgroup the pair is currently executing */ + u64 cgid; + + /* the pair started executing the current cgroup at */ + u64 started_at; + + /* whether the current cgroup is draining */ + bool draining; + + /* the CPUs that are currently active on the cgroup */ + u32 active_mask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, MAX_CPUS / 2); + __type(key, u32); + __type(value, struct pair_ctx); +} pair_ctx SEC(".maps"); + +/* queue of cgrp_q's possibly with tasks on them */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + /* + * Because it's difficult to build strong synchronization encompassing + * multiple non-trivial operations in BPF, this queue is managed in an + * opportunistic way so that we guarantee that a cgroup w/ active tasks + * is always on it but possibly multiple times. Once we have more robust + * synchronization constructs and e.g. linked list, we should be able to + * do this in a prettier way but for now just size it big enough. + */ + __uint(max_entries, 4 * MAX_CGRPS); + __type(value, u64); +} top_q SEC(".maps"); + +/* per-cgroup q which FIFOs the tasks from the cgroup */ +struct cgrp_q { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_QUEUED); + __type(value, u32); +}; + +/* + * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local + * storage; however, a cgroup local storage can only be accessed from the BPF + * progs attached to the cgroup. For now, work around by allocating array of + * cgrp_q's and then allocating per-cgroup indices. + * + * Another caveat: It's difficult to populate a large array of maps statically + * or from BPF. Initialize it from userland. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_CGRPS); + __type(key, s32); + __array(values, struct cgrp_q); +} cgrp_q_arr SEC(".maps"); + +static u64 cgrp_q_len[MAX_CGRPS]; + +/* + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be + * useful to have as a map type. + */ +static u32 cgrp_q_idx_cursor; +static u64 cgrp_q_idx_busy[MAX_CGRPS]; + +/* + * All added up, the following is what we do: + * + * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking + * for a free ID. If not found, fail cgroup creation with -EBUSY. + * + * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following + * cgrp_q_idx_hash. + * + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. + * + * This is sadly complicated for something pretty simple. Hopefully, we should + * be able to simplify in the future. 
+ */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_CGRPS); + __uint(key_size, sizeof(u64)); /* cgrp ID */ + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ +} cgrp_q_idx_hash SEC(".maps"); + +/* statistics */ +u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; +u64 nr_exps, nr_exp_waits, nr_exp_empty; +u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; + +struct user_exit_info uei; + +static bool time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + struct cgrp_q *cgq; + s32 pid = p->pid; + u64 cgid; + u32 *q_idx; + u64 *cgq_len; + + __sync_fetch_and_add(&nr_total, 1); + + cgrp = scx_bpf_task_cgroup(p); + cgid = cgrp->kn->id; + bpf_cgroup_release(cgrp); + + /* find the cgroup's q and push @p into it */ + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!q_idx) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return; + } + + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); + if (!cgq) { + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", + cgid, *q_idx); + return; + } + + if (bpf_map_push_elem(cgq, &pid, 0)) { + scx_bpf_error("cgroup[%llu] queue overflow", cgid); + return; + } + + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len) { + scx_bpf_error("MEMBER_VTPR malfunction"); + return; + } + + if (!__sync_fetch_and_add(cgq_len, 1) && + bpf_map_push_elem(&top_q, &cgid, 0)) { + scx_bpf_error("top_q overflow"); + return; + } +} + +static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) +{ + u32 *vptr; + + vptr = (u32 *)MEMBER_VPTR(pair_id, [cpu]); + if (!vptr) + return -EINVAL; + + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); + if (!(*pairc)) + return -EINVAL; + + vptr = (u32 *)MEMBER_VPTR(in_pair_idx, [cpu]); + if (!vptr) + return -EINVAL; + + *mask = 1U << *vptr; + + return 0; +} + +static int try_dispatch(s32 cpu) +{ + struct pair_ctx *pairc; + struct bpf_map *cgq_map; + struct task_struct *p; + u64 now = bpf_ktime_get_ns(); + bool kick_pair = false; + bool expired; + u32 *vptr, in_pair_mask; + s32 pid, q_idx; + u64 cgid; + int ret; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) { + scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", + cpu); + return -ENOENT; + } + + bpf_spin_lock(&pairc->lock); + pairc->active_mask &= ~in_pair_mask; + + expired = time_before(pairc->started_at + pair_batch_dur_ns, now); + if (expired || pairc->draining) { + u64 new_cgid = 0; + + __sync_fetch_and_add(&nr_exps, 1); + + /* + * We're done with the current cgid. An obvious optimization + * would be not draining if the next cgroup is the current one. + * For now, be dumb and always expire. + */ + pairc->draining = true; + + if (pairc->active_mask) { + /* + * The other CPU is still active We want to wait until + * this cgroup expires. + * + * If the pair controls its CPU, and the time already + * expired, kick. When the other CPU arrives at + * dispatch and clears its active mask, it'll push the + * pair to the next cgroup and kick this CPU. + */ + __sync_fetch_and_add(&nr_exp_waits, 1); + bpf_spin_unlock(&pairc->lock); + if (expired) + kick_pair = true; + goto out_maybe_kick; + } + + bpf_spin_unlock(&pairc->lock); + + /* + * Pick the next cgroup. 
It'd be easier / cleaner to not drop + * pairc->lock and use stronger synchronization here especially + * given that we'll be switching cgroups significantly less + * frequently than tasks. Unfortunately, bpf_spin_lock can't + * really protect anything non-trivial. Let's do opportunistic + * operations instead. + */ + bpf_repeat(BPF_MAX_LOOPS) { + u32 *q_idx; + u64 *cgq_len; + + if (bpf_map_pop_elem(&top_q, &new_cgid)) { + /* no active cgroup, go idle */ + __sync_fetch_and_add(&nr_exp_empty, 1); + return 0; + } + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); + if (!q_idx) + continue; + + /* + * This is the only place where empty cgroups are taken + * off the top_q. + */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len || !*cgq_len) + continue; + + /* + * If it has any tasks, requeue as we may race and not + * execute it. + */ + bpf_map_push_elem(&top_q, &new_cgid, 0); + break; + } + + bpf_spin_lock(&pairc->lock); + + /* + * The other CPU may already have started on a new cgroup while + * we dropped the lock. Make sure that we're still draining and + * start on the new cgroup. + */ + if (pairc->draining && !pairc->active_mask) { + __sync_fetch_and_add(&nr_cgrp_next, 1); + pairc->cgid = new_cgid; + pairc->started_at = now; + pairc->draining = false; + kick_pair = true; + } else { + __sync_fetch_and_add(&nr_cgrp_coll, 1); + } + } + + cgid = pairc->cgid; + pairc->active_mask |= in_pair_mask; + bpf_spin_unlock(&pairc->lock); + + /* again, it'd be better to do all these with the lock held, oh well */ + vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!vptr) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return -ENOENT; + } + q_idx = *vptr; + + /* claim one task from cgrp_q w/ q_idx */ + bpf_repeat(BPF_MAX_LOOPS) { + u64 *cgq_len, len; + + cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); + if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { + /* the cgroup must be empty, expire and repeat */ + __sync_fetch_and_add(&nr_cgrp_empty, 1); + bpf_spin_lock(&pairc->lock); + pairc->draining = true; + pairc->active_mask &= ~in_pair_mask; + bpf_spin_unlock(&pairc->lock); + return -EAGAIN; + } + + if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) + continue; + + break; + } + + cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); + if (!cgq_map) { + scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + if (bpf_map_pop_elem(cgq_map, &pid)) { + scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + p = bpf_task_from_pid(pid); + if (p) { + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } else { + /* we don't handle dequeues, retry on lost tasks */ + __sync_fetch_and_add(&nr_missing, 1); + return -EAGAIN; + } + +out_maybe_kick: + if (kick_pair) { + s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } + return 0; +} + +void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) +{ + bpf_repeat(BPF_MAX_LOOPS) { + if (try_dispatch(cpu) != -EAGAIN) + break; + } +} + +s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 i, q_idx; + + bpf_for(i, 0, MAX_CGRPS) { + q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; + if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) + break; + } + if (i == 
MAX_CGRPS) + return -EBUSY; + + if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); + if (busy) + *busy = 0; + return -EBUSY; + } + + return 0; +} + +void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 *q_idx; + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (q_idx) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); + if (busy) + *busy = 0; + bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); + } +} + +s32 BPF_STRUCT_OPS(pair_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops pair_ops = { + .enqueue = (void *)pair_enqueue, + .dispatch = (void *)pair_dispatch, + .cgroup_init = (void *)pair_cgroup_init, + .cgroup_exit = (void *)pair_cgroup_exit, + .init = (void *)pair_init, + .exit = (void *)pair_exit, + .name = "pair", +}; diff --git a/tools/sched_ext/scx_example_pair.c b/tools/sched_ext/scx_example_pair.c new file mode 100644 index 0000000000000..18e032bbc173b --- /dev/null +++ b/tools/sched_ext/scx_example_pair.c @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_pair.h" +#include "scx_example_pair.skel.h" + +const char help_fmt[] = +"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" +"execute from the same CPU cgroup.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-S STRIDE] [-p]\n" +"\n" +" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_example_pair *skel; + struct bpf_link *link; + u64 seq = 0; + s32 stride, i, opt, outer_fd; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_pair__open(); + assert(skel); + + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + /* pair up the earlier half to the latter by default, override with -s */ + stride = skel->rodata->nr_cpu_ids / 2; + + while ((opt = getopt(argc, argv, "S:ph")) != -1) { + switch (opt) { + case 'S': + stride = strtoul(optarg, NULL, 0); + break; + case 'p': + skel->rodata->switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { + if (skel->rodata->pair_cpu[i] < 0) { + skel->rodata->pair_cpu[i] = i + stride; + skel->rodata->pair_cpu[i + stride] = i; + skel->rodata->pair_id[i] = i; + skel->rodata->pair_id[i + stride] = i; + skel->rodata->in_pair_idx[i] = 0; + skel->rodata->in_pair_idx[i + stride] = 1; + } + } + + assert(!scx_example_pair__load(skel)); + + /* + * Populate the cgrp_q_arr map which is an array containing per-cgroup + * queues. It'd probably be better to do this from BPF but there are too + * many to initialize statically and there's no way to dynamically + * populate from BPF. 
+ */ + outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); + assert(outer_fd >= 0); + + printf("Initializing"); + for (i = 0; i < MAX_CGRPS; i++) { + s32 inner_fd; + + if (exit_req) + break; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, + sizeof(u32), MAX_QUEUED, NULL); + assert(inner_fd >= 0); + assert(!bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY)); + close(inner_fd); + + if (!(i % 10)) + printf("."); + fflush(stdout); + } + printf("\n"); + + /* + * Fully initialized, attach and run. + */ + link = bpf_map__attach_struct_ops(skel->maps.pair_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + printf("[SEQ %lu]\n", seq++); + printf(" total:%10lu dispatch:%10lu missing:%10lu\n", + skel->bss->nr_total, + skel->bss->nr_dispatched, + skel->bss->nr_missing); + printf(" kicks:%10lu preemptions:%7lu\n", + skel->bss->nr_kicks, + skel->bss->nr_preemptions); + printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n", + skel->bss->nr_exps, + skel->bss->nr_exp_waits, + skel->bss->nr_exp_empty); + printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n", + skel->bss->nr_cgrp_next, + skel->bss->nr_cgrp_coll, + skel->bss->nr_cgrp_empty); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_pair__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_example_pair.h b/tools/sched_ext/scx_example_pair.h new file mode 100644 index 0000000000000..f60b824272f75 --- /dev/null +++ b/tools/sched_ext/scx_example_pair.h @@ -0,0 +1,10 @@ +#ifndef __SCX_EXAMPLE_PAIR_H +#define __SCX_EXAMPLE_PAIR_H + +enum { + MAX_CPUS = 4096, + MAX_QUEUED = 4096, + MAX_CGRPS = 4096, +}; + +#endif /* __SCX_EXAMPLE_PAIR_H */ From f8ae50f03cff14f12ee6bff0d63c5c96f9996ce2 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 025/304] sched_ext: Implement SCX_KICK_WAIT If set when calling scx_bpf_kick_cpu(), the invoking CPU will busy wait for the kicked cpu to enter the scheduler. This will be used to improve the exclusion guarantees in scx_example_pair. Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 4 +++- kernel/sched/ext.c | 33 ++++++++++++++++++++++++++++++++- kernel/sched/ext.h | 20 ++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0780414f3c156..ff51977968fb0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6019,8 +6019,10 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) for_each_active_class(class) { p = class->pick_next_task(rq); - if (p) + if (p) { + scx_notify_pick_next_task(rq, p, class); return p; + } } BUG(); /* The idle class should always have a runnable task. */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f1c9616acff46..beb0de5c780a7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -126,6 +126,9 @@ static struct { static bool __cacheline_aligned_in_smp scx_has_idle_cpus; #endif /* CONFIG_SMP */ +/* for %SCX_KICK_WAIT */ +static u64 __percpu *scx_kick_cpus_pnt_seqs; + /* * Direct dispatch marker. 
* @@ -3209,6 +3212,7 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); + u64 *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); int this_cpu = cpu_of(this_rq); int cpu; @@ -3222,14 +3226,32 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) && rq->curr->sched_class == &ext_sched_class) rq->curr->scx.slice = 0; + pseqs[cpu] = rq->scx.pnt_seq; resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_rq->scx.cpus_to_wait); } raw_spin_rq_unlock_irqrestore(rq, flags); } + for_each_cpu_andnot(cpu, this_rq->scx.cpus_to_wait, + cpumask_of(this_cpu)) { + /* + * Pairs with smp_store_release() issued by this CPU in + * scx_notify_pick_next_task() on the resched path. + * + * We busy-wait here to guarantee that no other task can be + * scheduled on our core before the target CPU has entered the + * resched path. + */ + while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu]) + cpu_relax(); + } + cpumask_clear(this_rq->scx.cpus_to_kick); cpumask_clear(this_rq->scx.cpus_to_preempt); + cpumask_clear(this_rq->scx.cpus_to_wait); } void __init init_sched_ext_class(void) @@ -3243,7 +3265,7 @@ void __init init_sched_ext_class(void) * through the generated vmlinux.h. */ WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | - SCX_TG_ONLINE); + SCX_TG_ONLINE | SCX_KICK_PREEMPT); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); @@ -3251,6 +3273,12 @@ void __init init_sched_ext_class(void) BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); #endif + scx_kick_cpus_pnt_seqs = + __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * + num_possible_cpus(), + __alignof__(scx_kick_cpus_pnt_seqs[0])); + BUG_ON(!scx_kick_cpus_pnt_seqs); + for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); @@ -3259,6 +3287,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); } @@ -3527,6 +3556,8 @@ void scx_bpf_kick_cpu(s32 cpu, u64 flags) cpumask_set_cpu(cpu, rq->scx.cpus_to_kick); if (flags & SCX_KICK_PREEMPT) cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, rq->scx.cpus_to_wait); irq_work_queue(&rq->scx.kick_cpus_irq_work); preempt_enable(); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 0c5a109e7e6d1..fc27b28acfdea 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -66,6 +66,7 @@ enum scx_tg_flags { enum scx_kick_flags { SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ + SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ }; #ifdef CONFIG_SCHED_CLASS_EXT @@ -95,6 +96,22 @@ __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, #define scx_ops_error(fmt, args...) \ scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) +static inline void scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active) +{ +#ifdef CONFIG_SMP + if (!scx_enabled()) + return; + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. 
+ */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif +} + static inline void scx_notify_sched_tick(void) { unsigned long last_check; @@ -149,6 +166,9 @@ static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline void init_sched_ext_class(void) {} +static inline void scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active) {} static inline void scx_notify_sched_tick(void) {} #define for_each_active_class for_each_class diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3571cfda62a30..4b7f48239248b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -710,6 +710,8 @@ struct scx_rq { u32 flags; cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + u64 pnt_seq; struct irq_work kick_cpus_irq_work; }; #endif /* CONFIG_SCHED_CLASS_EXT */ From 7f1bb69418aad061a98b6e377f378bd28eb74f06 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 13 Apr 2023 06:40:07 -1000 Subject: [PATCH 026/304] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Scheduler classes are strictly ordered and when a higher priority class has tasks to run, the lower priority ones lose access to the CPU. Being able to monitor and act on these events are necessary for use cases includling strict core-scheduling and latency management. This patch adds two operations ops.cpu_acquire() and .cpu_release(). The former is invoked when a CPU becomes available to the BPF scheduler and the opposite for the latter. This patch also implements scx_bpf_reenqueue_local() which can be called from .cpu_release() to trigger requeueing of all tasks in the local dsq of the CPU so that the tasks can be reassigned to other available CPUs. scx_example_pair is updated to use .cpu_acquire/release() along with %SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU is preempted by a higher priority scheduler class. scx_example_qmap is updated to use .cpu_acquire/release() to empty the local dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers that want to have a tight control over latency. v3: * Drop the const qualifier from scx_cpu_release_args.task. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. v2: * Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local() from ops.cpu_release() nested inside ops.init() and other sleepable operations. 
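For illustration, the ops.cpu_release()/ops.cpu_acquire() pattern this patch describes (and which scx_example_qmap adopts below) can be sketched roughly as follows. This is a minimal, hypothetical example and not part of the patch; the "mysched" prefix is made up, and only the shape of the callbacks and the scx_bpf_reenqueue_local() call is shown:

void BPF_STRUCT_OPS(mysched_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	/*
	 * A higher priority sched_class is taking the CPU (see
	 * args->reason). Tasks already sitting on this CPU's local DSQ
	 * would otherwise be stuck behind it, so hand them back to
	 * ops.enqueue(); they are re-enqueued with SCX_ENQ_REENQ set and
	 * can then be dispatched to other CPUs.
	 */
	scx_bpf_reenqueue_local();
}

void BPF_STRUCT_OPS(mysched_cpu_acquire, s32 cpu,
		    struct scx_cpu_acquire_args *args)
{
	/* The CPU is back under SCX control; nothing to undo in this sketch. */
}

Both callbacks would be wired up through the .cpu_release and .cpu_acquire members of the scheduler's struct sched_ext_ops, alongside its .enqueue and .dispatch operations.
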
Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 53 +++++++++- kernel/sched/ext.c | 131 ++++++++++++++++++++++++- kernel/sched/ext.h | 24 ++++- kernel/sched/sched.h | 1 + tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_pair.bpf.c | 101 ++++++++++++++++++- tools/sched_ext/scx_example_qmap.bpf.c | 37 ++++++- tools/sched_ext/scx_example_qmap.c | 4 +- 8 files changed, 340 insertions(+), 12 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9e47e320369d7..826da32e29ba0 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -135,6 +135,32 @@ struct scx_cgroup_init_args { u32 weight; }; +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -330,6 +356,28 @@ struct sched_ext_ops { */ void (*update_idle)(s32 cpu, bool idle); + /** + * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. 
+ */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + /** * prep_enable - Prepare to enable BPF scheduling for a task * @p: task to prepare BPF scheduling for @@ -534,12 +582,15 @@ enum scx_kf_mask { /* all non-sleepables may be nested inside INIT and SLEEPABLE */ SCX_KF_INIT = 1 << 0, /* running ops.init() */ SCX_KF_SLEEPABLE = 1 << 1, /* other sleepable init operations */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 2, /* ops.cpu_release() */ /* ops.dequeue (in REST) may be nested inside DISPATCH */ SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() */ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ - __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_REST, + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_REST, __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_REST, }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index beb0de5c780a7..fb99c57184e1c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -83,6 +83,7 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = @@ -304,6 +305,12 @@ static __always_inline bool scx_kf_allowed(u32 mask) * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE * boundary thanks to the above in_interrupt() check. */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_ops_error("cpu_release kfunc called from a nested operation"); + return false; + } + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { scx_ops_error("dispatch kfunc called from a nested operation"); @@ -1377,6 +1384,19 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, lockdep_assert_rq_held(rq); + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in scx_notify_pick_next_task(). + */ + if (SCX_HAS_OP(cpu_acquire)) + SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + rq->scx.cpu_released = false; + } + if (prev_on_scx) { WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); update_curr_scx(rq); @@ -1384,7 +1404,9 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, /* * If @prev is runnable & has slice left, it has priority and * fetching more just increases latency for the fetched tasks. - * Tell put_prev_task_scx() to put @prev on local_dsq. + * Tell put_prev_task_scx() to put @prev on local_dsq. If the + * BPF scheduler wants to handle this explicitly, it should + * implement ->cpu_released(). * * See scx_ops_disable_workfn() for the explanation on the * disabling() test. 
@@ -1590,6 +1612,58 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ +#ifdef CONFIG_SMP + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_STOP; +#endif + if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_RT; + return SCX_CPU_PREEMPT_UNKNOWN; +} + +void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, + const struct sched_class *active) +{ + lockdep_assert_rq_held(rq); + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the + * callback if the CPU is is staying on SCX, or going idle (in which + * case the SCX scheduler has actively decided not to schedule any + * tasks on the CPU). + */ + if (likely(active >= &ext_sched_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted, and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked. + */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(active), + .task = task, + }; + + SCX_CALL_OP(SCX_KF_CPU_RELEASE, + cpu_release, cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } +} + #ifdef CONFIG_SMP static bool test_and_clear_cpu_idle(int cpu) @@ -2657,6 +2731,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_has_op[i]); static_branch_disable_cpuslocked(&scx_ops_enq_last); static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -2863,6 +2938,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); @@ -3526,6 +3603,56 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .set = &scx_kfunc_ids_dispatch, }; +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +u32 scx_bpf_reenqueue_local(void) +{ + u32 nr_enqueued, i; + struct rq *rq; + struct scx_rq *scx_rq; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + scx_rq = &rq->scx; + + /* + * Get the number of tasks on the local DSQ before iterating over it to + * pull off tasks. The enqueue callback below can signal that it wants + * the task to stay on the local DSQ, and we want to prevent the BPF + * scheduler from causing us to loop indefinitely. 
+ */ + nr_enqueued = scx_rq->local_dsq.nr; + for (i = 0; i < nr_enqueued; i++) { + struct task_struct *p; + + p = first_local_task(rq); + WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + WARN_ON_ONCE(p->scx.holding_cpu != -1); + dispatch_dequeue(scx_rq, p); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + } + + return nr_enqueued; +} + +BTF_SET8_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_SET8_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + /** * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick @@ -3862,6 +3989,8 @@ static int __init register_ext_kfuncs(void) &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_any))) { pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index fc27b28acfdea..4b22219c0dd9c 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -27,6 +27,17 @@ enum scx_enq_flags { */ SCX_ENQ_PREEMPT = 1LLU << 32, + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when @@ -82,6 +93,8 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) #define scx_switched_all() static_branch_unlikely(&__scx_switched_all) +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + bool task_on_scx(struct task_struct *p); void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); @@ -96,13 +109,17 @@ __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, #define scx_ops_error(fmt, args...) 
\ scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) +void __scx_notify_pick_next_task(struct rq *rq, + struct task_struct *p, + const struct sched_class *active); + static inline void scx_notify_pick_next_task(struct rq *rq, - const struct task_struct *p, + struct task_struct *p, const struct sched_class *active) { -#ifdef CONFIG_SMP if (!scx_enabled()) return; +#ifdef CONFIG_SMP /* * Pairs with the smp_load_acquire() issued by a CPU in * kick_cpus_irq_workfn() who is waiting for this CPU to perform a @@ -110,6 +127,9 @@ static inline void scx_notify_pick_next_task(struct rq *rq, */ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); #endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + __scx_notify_pick_next_task(rq, p, active); } static inline void scx_notify_sched_tick(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4b7f48239248b..5dabe6cff2b99 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -708,6 +708,7 @@ struct scx_rq { u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 flags; + bool cpu_released; cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index b81e5a89e9221..ef55e1c1aee5c 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -68,6 +68,7 @@ void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_example_pair.bpf.c index e5ff39083181f..279efe58b777b 100644 --- a/tools/sched_ext/scx_example_pair.bpf.c +++ b/tools/sched_ext/scx_example_pair.bpf.c @@ -89,6 +89,28 @@ * be resolved in the near future which should allow greatly simplifying this * scheduler. * + * Dealing with preemption + * ----------------------- + * + * SCX is the lowest priority sched_class, and could be preempted by them at + * any time. To address this, the scheduler implements pair_cpu_release() and + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when + * the scheduler loses and gains control of the CPU respectively. + * + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and + * then invoke: + * + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + * + * This preempts the pair CPU, and waits until it has re-entered the scheduler + * before returning. This is necessary to ensure that the higher priority + * sched_class that preempted our scheduler does not schedule a task + * concurrently with our pair CPU. + * + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable + * pair scheduling. + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet @@ -129,6 +151,12 @@ struct pair_ctx { /* the CPUs that are currently active on the cgroup */ u32 active_mask; + + /* + * the CPUs that are currently preempted and running tasks in a + * different scheduler. 
+ */ + u32 preempted_mask; }; struct { @@ -295,7 +323,7 @@ static int try_dispatch(s32 cpu) struct task_struct *p; u64 now = bpf_ktime_get_ns(); bool kick_pair = false; - bool expired; + bool expired, pair_preempted; u32 *vptr, in_pair_mask; s32 pid, q_idx; u64 cgid; @@ -324,10 +352,14 @@ static int try_dispatch(s32 cpu) */ pairc->draining = true; - if (pairc->active_mask) { + pair_preempted = pairc->preempted_mask; + if (pairc->active_mask || pair_preempted) { /* - * The other CPU is still active We want to wait until - * this cgroup expires. + * The other CPU is still active, or is no longer under + * our control due to e.g. being preempted by a higher + * priority sched_class. We want to wait until this + * cgroup expires, or until control of our pair CPU has + * been returned to us. * * If the pair controls its CPU, and the time already * expired, kick. When the other CPU arrives at @@ -336,7 +368,7 @@ static int try_dispatch(s32 cpu) */ __sync_fetch_and_add(&nr_exp_waits, 1); bpf_spin_unlock(&pairc->lock); - if (expired) + if (expired && !pair_preempted) kick_pair = true; goto out_maybe_kick; } @@ -475,6 +507,63 @@ void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask &= ~in_pair_mask; + /* Kick the pair CPU, unless it was also preempted. */ + kick_pair = !pairc->preempted_mask; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } +} + +void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask |= in_pair_mask; + pairc->active_mask &= ~in_pair_mask; + /* Kick the pair CPU if it's still running. */ + kick_pair = pairc->active_mask; + pairc->draining = true; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + } + } + __sync_fetch_and_add(&nr_preemptions, 1); +} + s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) { u64 cgid = cgrp->kn->id; @@ -528,6 +617,8 @@ SEC(".struct_ops") struct sched_ext_ops pair_ops = { .enqueue = (void *)pair_enqueue, .dispatch = (void *)pair_dispatch, + .cpu_acquire = (void *)pair_cpu_acquire, + .cpu_release = (void *)pair_cpu_release, .cgroup_init = (void *)pair_cgroup_init, .cgroup_exit = (void *)pair_cgroup_exit, .init = (void *)pair_init, diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index ed704a4024c0d..88e69b9670040 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -11,6 +11,8 @@ * * - BPF-side queueing using PIDs. * - Sleepable per-task storage allocation using ops.prep_enable(). + * - Using ops.cpu_release() to handle a higher priority scheduling class taking + * the CPU away. 
* * This scheduler is primarily for demonstration and testing of sched_ext * features and unlikely to be useful for actual workloads. @@ -81,7 +83,7 @@ struct { } dispatch_idx_cnt SEC(".maps"); /* Statistics */ -unsigned long nr_enqueued, nr_dispatched, nr_dequeued; +unsigned long nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -155,6 +157,22 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* + * If the task was re-enqueued due to the CPU being preempted by a + * higher priority scheduling class, just re-enqueue the task directly + * on the global DSQ. As we want another CPU to pick it up, find and + * kick an idle CPU. + */ + if (enq_flags & SCX_ENQ_REENQ) { + s32 cpu; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, 0); + return; + } + ring = bpf_map_lookup_elem(&queue_arr, &idx); if (!ring) { scx_bpf_error("failed to find ring %d", idx); @@ -240,6 +258,22 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + u32 cnt; + + /* + * Called when @cpu is taken by a higher priority scheduling class. This + * makes @cpu no longer available for executing sched_ext tasks. As we + * don't want the tasks in @cpu's local dsq to sit there until @cpu + * becomes available again, re-enqueue them into the global dsq. See + * %SCX_ENQ_REENQ handling in qmap_enqueue(). + */ + cnt = scx_bpf_reenqueue_local(); + if (cnt) + __sync_fetch_and_add(&nr_reenqueued, cnt); +} + s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, struct scx_enable_args *args) { @@ -275,6 +309,7 @@ struct sched_ext_ops qmap_ops = { .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .cpu_release = (void *)qmap_cpu_release, .prep_enable = (void *)qmap_prep_enable, .init = (void *)qmap_init, .exit = (void *)qmap_exit, diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index 3f68dae47bd06..2ae3794c9ea82 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -92,9 +92,9 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("enq=%lu, dsp=%lu, delta=%ld, deq=%lu\n", + printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, - skel->bss->nr_dequeued); + skel->bss->nr_reenqueued, skel->bss->nr_dequeued); fflush(stdout); sleep(1); } From 049c8bb58f465c5f00d4e23c9d0c2e7a107f4f59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:08 -1000 Subject: [PATCH 027/304] sched_ext: Implement sched_ext_ops.cpu_online/offline() Add ops.cpu_online/offline() which are invoked when CPUs come online and offline respectively. As the enqueue path already automatically bypasses tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed to see tasks only on CPUs which are between online() and offline(). 
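For illustration only (the callback and counter names below are hypothetical and not part of this patch), a BPF scheduler that just wants to observe hotplug events might hook the new callbacks as follows, wiring them up through the .cpu_online and .cpu_offline members of its struct_ops definition like any other operation:

  u64 nr_onlines, nr_offlines;

  void BPF_STRUCT_OPS(example_cpu_online, s32 cpu)
  {
          /* @cpu just came up and can now run this scheduler's tasks */
          __sync_fetch_and_add(&nr_onlines, 1);
  }

  void BPF_STRUCT_OPS(example_cpu_offline, s32 cpu)
  {
          /*
           * @cpu is going away and won't call ops.enqueue() or run tasks
           * associated with other CPUs afterwards.
           */
          __sync_fetch_and_add(&nr_offlines, 1);
  }
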
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 18 ++++++++++++++++++ kernel/sched/ext.c | 18 +++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 826da32e29ba0..63a011860f590 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -378,6 +378,24 @@ struct sched_ext_ops { */ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + /** * prep_enable - Prepare to enable BPF scheduling for a task * @p: task to prepare BPF scheduling for diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fb99c57184e1c..d675d8d1c13c5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1393,7 +1393,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, * emitted in scx_notify_pick_next_task(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), + NULL); rq->scx.cpu_released = false; } @@ -1824,6 +1825,18 @@ void __scx_update_idle(struct rq *rq, bool idle) } } +static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); +} + +static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); +} + #else /* !CONFIG_SMP */ static bool test_and_clear_cpu_idle(int cpu) { return false; } @@ -2329,6 +2342,9 @@ DEFINE_SCHED_CLASS(ext) = { .balance = balance_scx, .select_task_rq = select_task_rq_scx, .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, #endif .task_tick = task_tick_scx, From 6399c79601d8be4f71e9d74972afccbc0c809bf4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:08 -1000 Subject: [PATCH 028/304] sched_ext: Implement core-sched support The core-sched support is composed of the following parts: * task_struct->scx.core_sched_at is added. This is a timestamp which can be used to order tasks. Depending on whether the BPF scheduler implements custom ordering, it tracks either global FIFO ordering of all tasks or local-DSQ ordering within the dispatched tasks on a CPU. * prio_less() is updated to call scx_prio_less() when comparing SCX tasks. scx_prio_less() calls ops.core_sched_before() if available or uses the core_sched_at timestamp. For global FIFO ordering, the BPF scheduler doesn't need to do anything. Otherwise, it should implement ops.core_sched_before() which reflects the ordering. * When core-sched is enabled, balance_scx() balances all SMT siblings so that they all have tasks dispatched if necessary before pick_task_scx() is called. pick_task_scx() picks between the current task and the first dispatched task on the local DSQ based on availability and the core_sched_at timestamps. 
Note that FIFO ordering is expected among the already dispatched tasks whether running or on the local DSQ, so this path always compares core_sched_at instead of calling into ops.core_sched_before(). qmap_core_sched_before() is added to scx_example_qmap. It scales the distances from the heads of the queues to compare the tasks across different priority queues and seems to behave as expected. v2: * Sched core added the const qualifiers to prio_less task arguments. Explicitly drop them for ops.core_sched_before() task arguments. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Reviewed-by: Josh Don --- include/linux/sched/ext.h | 21 +++ kernel/Kconfig.preempt | 2 +- kernel/sched/core.c | 12 +- kernel/sched/ext.c | 219 +++++++++++++++++++++++-- kernel/sched/ext.h | 13 ++ tools/sched_ext/scx_example_qmap.bpf.c | 87 +++++++++- tools/sched_ext/scx_example_qmap.c | 5 +- 7 files changed, 342 insertions(+), 17 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 63a011860f590..210b8516d1979 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -316,6 +316,24 @@ struct sched_ext_ops { */ bool (*yield)(struct task_struct *from, struct task_struct *to); + /** + * core_sched_before - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. 
+ */ + bool (*core_sched_before)(struct task_struct *a,struct task_struct *b); + /** * set_weight - Set task weight * @p: task to set weight for @@ -628,6 +646,9 @@ struct sched_ext_entity { struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic64_t ops_state; unsigned long runnable_at; +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif /* BPF scheduler modifiable fields */ diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0afcda19bc50c..e12a057ead7b6 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -135,7 +135,7 @@ config SCHED_CORE config SCHED_CLASS_EXT bool "Extensible Scheduling Class" - depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE + depends on BPF_SYSCALL && BPF_JIT help This option enables a new scheduler class sched_ext (SCX), which allows scheduling policies to be implemented as BPF programs to diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ff51977968fb0..146d736fe73b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -163,7 +163,12 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ +#ifdef CONFIG_SCHED_CLASS_EXT + if (p->sched_class == &ext_sched_class) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ +#endif + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -192,6 +197,11 @@ static inline bool prio_less(const struct task_struct *a, if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d675d8d1c13c5..195ebfa0e67e9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -530,6 +530,49 @@ static int ops_sanitize_err(const char *ops_name, s32 err) return -EPROTO; } +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = rq_clock_task(rq); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. 
+ */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) + touch_core_sched(rq, p); +#endif +} + static void update_curr_scx(struct rq *rq) { struct task_struct *curr = rq->curr; @@ -545,8 +588,11 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - if (curr->scx.slice != SCX_SLICE_INF) + if (curr->scx.slice != SCX_SLICE_INF) { curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } } static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, @@ -702,6 +748,8 @@ static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p return; } + touch_core_sched_dispatch(task_rq(p), p); + dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p); dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); @@ -785,12 +833,19 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, return; local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. + */ + touch_core_sched(rq, p); p->scx.slice = SCX_SLICE_DFL; local_norefill: dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); return; global: + touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(&scx_dsq_global, p, enq_flags); } @@ -847,6 +902,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (SCX_HAS_OP(runnable)) SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + if (enq_flags & SCX_ENQ_WAKEUP) + touch_core_sched(rq, p); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); } @@ -1297,6 +1355,7 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, struct scx_dispatch_q *dsq; u64 opss; + touch_core_sched_dispatch(rq, p); retry: /* * No need for _acquire here. @p is accessed only after a successful @@ -1374,8 +1433,8 @@ static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) dspc->buf_cursor = 0; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static int balance_one(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf, bool local) { struct scx_rq *scx_rq = &rq->scx; struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); @@ -1399,7 +1458,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, } if (prev_on_scx) { - WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); + WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); update_curr_scx(rq); /* @@ -1411,10 +1470,16 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, * * See scx_ops_disable_workfn() for the explanation on the * disabling() test. + * + * When balancing a remote CPU for core-sched, there won't be a + * following put_prev_task_scx() call and we don't own + * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the + * same conditions later and pick @rq->curr accordingly. 
*/ if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && !scx_ops_disabling()) { - prev->scx.flags |= SCX_TASK_BAL_KEEP; + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; return 1; } } @@ -1470,10 +1535,55 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, return 0; } +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + int ret; + + ret = balance_one(rq, prev, rf, true); + + /* + * When core-sched is enabled, this ops.balance() call will be followed + * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() + * on the SMT siblings. Balance the siblings too. + */ + if (sched_core_enabled(rq)) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + int scpu; + + for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { + struct rq *srq = cpu_rq(scpu); + struct rq_flags srf; + struct task_struct *sprev = srq->curr; + + /* + * While core-scheduling, rq lock is shared among + * siblings but the debug annotations and rq clock + * aren't. Do pinning dance to transfer the ownership. + */ + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + rq_unpin_lock(rq, rf); + rq_pin_lock(srq, &srf); + + update_rq_clock(srq); + balance_one(srq, sprev, &srf, false); + + rq_unpin_lock(srq, &srf); + rq_repin_lock(rq, rf); + } + } + + return ret; +} + static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { if (p->scx.flags & SCX_TASK_QUEUED) { - WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(&rq->scx, p); } @@ -1556,7 +1666,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) /* * If @p has slice left and balance_scx() didn't tag it for * keeping, @p is getting preempted by a higher priority - * scheduler class. Leave it at the head of the local DSQ. + * scheduler class or core-sched forcing a different task. Leave + * it at the head of the local DSQ. */ if (p->scx.slice && !scx_ops_disabling()) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); @@ -1613,6 +1724,84 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +#ifdef CONFIG_SCHED_CORE +/** + * scx_prio_less - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Core-sched is implemented as an additional scheduling layer on top of the + * usual sched_class'es and needs to find out the expected task ordering. For + * SCX, core-sched calls this function to interrogate the task ordering. + * + * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used + * to implement the default task ordering. The older the timestamp, the higher + * prority the task - the global FIFO ordering matching the default scheduling + * behavior. + * + * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to + * implement FIFO ordering within each local DSQ. See pick_task_scx(). + */ +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + /* + * The const qualifiers are dropped from task_struct pointers when + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. 
+ */ + if (SCX_HAS_OP(core_sched_before) && !scx_ops_disabling()) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + (struct task_struct *)a, + (struct task_struct *)b); + else + return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); +} + +/** + * pick_task_scx - Pick a candidate task for core-sched + * @rq: rq to pick the candidate task from + * + * Core-sched calls this function on each SMT sibling to determine the next + * tasks to run on the SMT siblings. balance_one() has been called on all + * siblings and put_prev_task_scx() has been called only for the current CPU. + * + * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look + * at the first task in the local dsq. @rq->curr has to be considered explicitly + * to mimic %SCX_TASK_BAL_KEEP. + */ +static struct task_struct *pick_task_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct task_struct *first = first_local_task(rq); + + if (curr->scx.flags & SCX_TASK_QUEUED) { + /* is curr the only runnable task? */ + if (!first) + return curr; + + /* + * Does curr trump first? We can always go by core_sched_at for + * this comparison as it represents global FIFO ordering when + * the default core-sched ordering is used and local-DSQ FIFO + * ordering otherwise. + * + * We can have a task with an earlier timestamp on the DSQ. For + * example, when a current task is preempted by a sibling + * picking a different cookie, the task would be requeued at the + * head of the local DSQ with an earlier timestamp than the + * core-sched picked next task. Besides, the BPF scheduler may + * dispatch any tasks to the local DSQ anytime. + */ + if (curr->scx.slice && time_before64(curr->scx.core_sched_at, + first->scx.core_sched_at)) + return curr; + } + + return first; /* this may be %NULL */ +} +#endif /* CONFIG_SCHED_CORE */ + static enum scx_cpu_preempt_reason preempt_reason_from_class(const struct sched_class *class) { @@ -1893,11 +2082,13 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) update_curr_scx(rq); /* - * While disabling, always resched as we can't trust the slice - * management. + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). */ - if (scx_ops_disabling()) + if (scx_ops_disabling()) { curr->scx.slice = 0; + touch_core_sched(rq, curr); + } if (!curr->scx.slice) resched_curr(rq); @@ -2347,6 +2538,10 @@ DEFINE_SCHED_CLASS(ext) = { .rq_offline = rq_offline_scx, #endif +#ifdef CONFIG_SCHED_CORE + .pick_task = pick_task_scx, +#endif + .task_tick = task_tick_scx, .switching_to = switching_to_scx, @@ -2675,9 +2870,11 @@ static void scx_ops_disable_workfn(struct kthread_work *work) * * b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value * can't be trusted. Whenever a tick triggers, the running task is - * rotated to the tail of the queue. + * rotated to the tail of the queue with core_sched_at touched. * * c. pick_next_task() suppresses zero slice warning. + * + * d. scx_prio_less() reverts to the default core_sched_at order. 
*/ scx_ops.enqueue = scx_ops_fallback_enqueue; scx_ops.dispatch = scx_ops_fallback_dispatch; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4b22219c0dd9c..7b7973e6d8c05 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -68,6 +68,14 @@ enum scx_enq_flags { enum scx_deq_flags { /* expose select DEQUEUE_* flags as enums */ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, }; enum scx_tg_flags { @@ -173,6 +181,11 @@ static inline const struct sched_class *next_active_class(const struct sched_cla for_active_class_range(class, (prev_class) > &ext_sched_class ? \ &ext_sched_class : (prev_class), (end_class)) +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + #else /* CONFIG_SCHED_CLASS_EXT */ #define scx_enabled() false diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 88e69b9670040..579ab21ae4036 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -13,6 +13,7 @@ * - Sleepable per-task storage allocation using ops.prep_enable(). * - Using ops.cpu_release() to handle a higher priority scheduling class taking * the CPU away. + * - Core-sched support. * * This scheduler is primarily for demonstration and testing of sched_ext * features and unlikely to be useful for actual workloads. @@ -62,9 +63,21 @@ struct { }, }; +/* + * Per-queue sequence numbers to implement core-sched ordering. + * + * Tail seq is assigned to each queued task and incremented. Head seq tracks the + * sequence number of the latest dispatched task. The distance between the a + * task's seq and the associated queue's head seq is called the queue distance + * and used when comparing two tasks for ordering. See qmap_core_sched_before(). + */ +static u64 core_sched_head_seqs[5]; +static u64 core_sched_tail_seqs[5]; + /* Per-task scheduling context */ struct task_ctx { bool force_local; /* Dispatch directly to local_dsq */ + u64 core_sched_seq; }; struct { @@ -84,6 +97,7 @@ struct { /* Statistics */ unsigned long nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +unsigned long nr_core_sched_execed; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -150,8 +164,18 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* Is select_cpu() is telling us to enqueue locally? */ - if (tctx->force_local) { + /* + * All enqueued tasks must have their core_sched_seq updated for correct + * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in + * qmap_ops.flags. + */ + tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + + /* + * If qmap_select_cpu() is telling us to or this is the last runnable + * task on the CPU, enqueue locally. 
+ */ + if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { tctx->force_local = false; scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; @@ -195,6 +219,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) { __sync_fetch_and_add(&nr_dequeued, 1); + if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) + __sync_fetch_and_add(&nr_core_sched_execed, 1); +} + +static void update_core_sched_head_seq(struct task_struct *p) +{ + struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); + + if (tctx) + core_sched_head_seqs[idx] = tctx->core_sched_seq; + else + scx_bpf_error("task_ctx lookup failed"); } void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -247,6 +284,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) p = bpf_task_from_pid(pid); if (p) { + update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); bpf_task_release(p); @@ -258,6 +296,49 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +/* + * The distance from the head of the queue scaled by the weight of the queue. + * The lower the number, the older the task and the higher the priority. + */ +static s64 task_qdist(struct task_struct *p) +{ + int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; + s64 qdist; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return 0; + } + + qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + + /* + * As queue index increments, the priority doubles. The queue w/ index 3 + * is dispatched twice more frequently than 2. Reflect the difference by + * scaling qdists accordingly. Note that the shift amount needs to be + * flipped depending on the sign to avoid flipping priority direction. + */ + if (qdist >= 0) + return qdist << (4 - idx); + else + return qdist << idx; +} + +/* + * This is called to determine the task ordering when core-sched is picking + * tasks to execute on SMT siblings and should encode about the same ordering as + * the regular scheduling path. Use the priority-scaled distances from the head + * of the queues to compare the two tasks which should be consistent with the + * dispatch path behavior. 
+ */ +bool BPF_STRUCT_OPS(qmap_core_sched_before, + struct task_struct *a, struct task_struct *b) +{ + return task_qdist(a) > task_qdist(b); +} + void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) { u32 cnt; @@ -309,10 +390,12 @@ struct sched_ext_ops qmap_ops = { .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, .prep_enable = (void *)qmap_prep_enable, .init = (void *)qmap_init, .exit = (void *)qmap_exit, + .flags = SCX_OPS_ENQ_LAST, .timeout_ms = 5000U, .name = "qmap", }; diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index 2ae3794c9ea82..ccb4814ee61ba 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -92,9 +92,10 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu\n", + printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu, core=%lu\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, - skel->bss->nr_reenqueued, skel->bss->nr_dequeued); + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed); fflush(stdout); sleep(1); } From 666284f73c4660896ea20c1451039282cad0bbbd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:08 -1000 Subject: [PATCH 029/304] sched_ext: Add vtime-ordered priority queue to dispatch_q's Currently, a dsq is always a FIFO. A task which is dispatched earlier gets consumed or executed earlier. While this is sufficient when dsq's are used for simple staging areas for tasks which are ready to execute, it'd make dsq's a lot more useful if they can implement custom ordering. This patch adds a vtime-ordered priority queue to dsq's. When the BPF scheduler dispatches a task with the new scx_bpf_dispatch_vtime() helper, it can specify the vtime tha the task should be inserted at and the task is inserted into the priority queue in the dsq which is ordered according to time_before64() comparison of the vtime values. When executing or consuming the dsq, the FIFO is always processed first and the priority queue is processed iff the FIFO is empty. The design decision was made to allow both FIFO and priority queue to be available at the same timeq for all dsq's for three reasons. First, the new priority queue is useful for the local dsq's too but they also need the FIFO when consuming tasks from other dsq's as the vtimes may not be comparable across them. Second, the interface surface is smaller this way - the only additional interface necessary is scx_bpf_dispsatch_vtime(). Third, the overhead isn't meaningfully different whether they're available at the same time or not. This makes it very easy for the BPF schedulers to implement proper vtime based scheduling within each dsq very easy and efficient at a negligible cost in terms of code complexity and overhead. scx_example_simple and scx_example_flatcg are updated to default to weighted vtime scheduling (the latter within each cgroup). FIFO scheduling can be selected with -f option. 
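As a rough sketch of the new interface (the callback name is made up; the scx_example_simple update below is the authoritative usage), an ops.enqueue() implementation can order tasks on the global DSQ by vtime as follows, assuming a global vtime_now advanced from ops.running() and the vtime_before() helper used by the example schedulers:

  void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
  {
          u64 vtime = p->scx.dsq_vtime;

          /* cap the budget an idling task can accumulate to one slice */
          if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
                  vtime = vtime_now - SCX_SLICE_DFL;

          scx_bpf_dispatch_vtime(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, vtime,
                                 enq_flags);
  }

Tasks dispatched this way are consumed from the DSQ's priority queue only after its FIFO has been drained, per the ordering described above.
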
Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/linux/sched/ext.h | 16 ++- init/init_task.c | 2 +- kernel/sched/core.c | 3 +- kernel/sched/ext.c | 137 ++++++++++++++++++++--- kernel/sched/ext.h | 1 + tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_flatcg.bpf.c | 50 ++++++++- tools/sched_ext/scx_example_flatcg.c | 6 +- tools/sched_ext/scx_example_simple.bpf.c | 66 ++++++++++- tools/sched_ext/scx_example_simple.c | 8 +- 10 files changed, 264 insertions(+), 26 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 210b8516d1979..fe2b051230b28 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -583,6 +583,7 @@ struct sched_ext_ops { struct scx_dispatch_q { raw_spinlock_t lock; struct list_head fifo; /* processed in dispatching order */ + struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */ u32 nr; u64 id; struct rhash_head hash_node; @@ -595,6 +596,7 @@ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */ + SCX_TASK_ON_DSQ_PRIQ = 1 << 3, /* task is queued on the priority queue of a dsq */ SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ @@ -636,7 +638,10 @@ enum scx_kf_mask { */ struct sched_ext_entity { struct scx_dispatch_q *dsq; - struct list_head dsq_node; + struct { + struct list_head fifo; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + } dsq_node; struct list_head watchdog_node; u32 flags; /* protected by rq lock */ u32 weight; @@ -664,6 +669,15 @@ struct sched_ext_entity { */ u64 slice; + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + /* * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. 
diff --git a/init/init_task.c b/init/init_task.c index 913194aab6232..7ea89ccd0cf12 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -105,7 +105,7 @@ struct task_struct init_task #endif #ifdef CONFIG_SCHED_CLASS_EXT .scx = { - .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node), + .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node), .sticky_cpu = -1, .holding_cpu = -1, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 146d736fe73b3..b33389e17765f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4495,7 +4495,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHED_CLASS_EXT p->scx.dsq = NULL; - INIT_LIST_HEAD(&p->scx.dsq_node); + INIT_LIST_HEAD(&p->scx.dsq_node.fifo); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); INIT_LIST_HEAD(&p->scx.watchdog_node); p->scx.flags = 0; p->scx.weight = 0; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 195ebfa0e67e9..3515a3fba2837 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -595,12 +595,25 @@ static void update_curr_scx(struct rq *rq) } } +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_node.priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_node.priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { bool is_local = dsq->id == SCX_DSQ_LOCAL; - WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); + WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); if (!is_local) { raw_spin_lock(&dsq->lock); @@ -613,10 +626,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) - list_add(&p->scx.dsq_node, &dsq->fifo); - else - list_add_tail(&p->scx.dsq_node, &dsq->fifo); + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + p->scx.flags |= SCX_TASK_ON_DSQ_PRIQ; + rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, + scx_dsq_priq_less); + } else { + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_node.fifo, &dsq->fifo); + else + list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); + } dsq->nr++; p->scx.dsq = dsq; @@ -645,13 +664,31 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + if (p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) { + rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.flags &= ~SCX_TASK_ON_DSQ_PRIQ; + } else { + list_del_init(&p->scx.dsq_node.fifo); + } +} + +static bool task_linked_on_dsq(struct task_struct *p) +{ + return !list_empty(&p->scx.dsq_node.fifo) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq); +} + static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) { struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &scx_rq->local_dsq; if (!dsq) { - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); /* * When dispatching directly from the BPF scheduler to a local * DSQ, the task isn't associated with any DSQ but @@ -672,8 +709,8 @@ static void dispatch_dequeue(struct scx_rq *scx_rq, struct 
task_struct *p) */ if (p->scx.holding_cpu < 0) { /* @p must still be on @dsq, dequeue */ - WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); - list_del_init(&p->scx.dsq_node); + WARN_ON_ONCE(!task_linked_on_dsq(p)); + task_unlink_from_dsq(p, dsq); dsq->nr--; } else { /* @@ -682,7 +719,7 @@ static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) * holding_cpu which tells dispatch_to_local_dsq() that it lost * the race. */ - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); p->scx.holding_cpu = -1; } p->scx.dsq = NULL; @@ -1146,33 +1183,52 @@ static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, #endif /* CONFIG_SMP */ +static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) +{ + return likely(test_rq_online(rq)) && !is_migration_disabled(p) && + cpumask_test_cpu(cpu_of(rq), p->cpus_ptr); +} + static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, struct scx_dispatch_q *dsq) { struct scx_rq *scx_rq = &rq->scx; struct task_struct *p; + struct rb_node *rb_node; struct rq *task_rq; bool moved = false; retry: - if (list_empty(&dsq->fifo)) + if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq)) return false; raw_spin_lock(&dsq->lock); - list_for_each_entry(p, &dsq->fifo, scx.dsq_node) { + + list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) { + task_rq = task_rq(p); + if (rq == task_rq) + goto this_rq; + if (task_can_run_on_rq(p, rq)) + goto remote_rq; + } + + for (rb_node = rb_first_cached(&dsq->priq); rb_node; + rb_node = rb_next(rb_node)) { + p = container_of(rb_node, struct task_struct, scx.dsq_node.priq); task_rq = task_rq(p); if (rq == task_rq) goto this_rq; - if (likely(test_rq_online(rq)) && !is_migration_disabled(p) && - cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)) + if (task_can_run_on_rq(p, rq)) goto remote_rq; } + raw_spin_unlock(&dsq->lock); return false; this_rq: /* @dsq is locked and @p is on this rq */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_move_tail(&p->scx.dsq_node, &scx_rq->local_dsq.fifo); + task_unlink_from_dsq(p, dsq); + list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo); dsq->nr--; scx_rq->local_dsq.nr++; p->scx.dsq = &scx_rq->local_dsq; @@ -1189,7 +1245,7 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, * move_task_to_local_dsq(). 
*/ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_del_init(&p->scx.dsq_node); + task_unlink_from_dsq(p, dsq); dsq->nr--; p->scx.holding_cpu = raw_smp_processor_id(); raw_spin_unlock(&dsq->lock); @@ -1692,8 +1748,18 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { - return list_first_entry_or_null(&rq->scx.local_dsq.fifo, - struct task_struct, scx.dsq_node); + struct rb_node *rb_node; + + if (!list_empty(&rq->scx.local_dsq.fifo)) + return list_first_entry(&rq->scx.local_dsq.fifo, + struct task_struct, scx.dsq_node.fifo); + + rb_node = rb_first_cached(&rq->scx.local_dsq.priq); + if (rb_node) + return container_of(rb_node, + struct task_struct, scx.dsq_node.priq); + + return NULL; } static struct task_struct *pick_next_task_scx(struct rq *rq) @@ -3359,6 +3425,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; if (off >= offsetof(struct task_struct, scx.disallow) && off + size <= offsetofend(struct task_struct, scx.disallow)) return SCALAR_VALUE; @@ -3739,8 +3808,42 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, scx_dispatch_commit(p, dsq_id, enq_flags); } +/** + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime and always + * consumed after the tasks in the FIFO queue. All other aspects are identical + * to scx_bpf_dispatch(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. 
+ */ +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, + u64 vtime, u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + BTF_SET8_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_SET8_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 7b7973e6d8c05..b5a31fae21689 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -63,6 +63,7 @@ enum scx_enq_flags { __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, }; enum scx_deq_flags { diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index ef55e1c1aee5c..e56de9dc86f28 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -57,6 +57,7 @@ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_example_flatcg.bpf.c index 9632bab7f164c..f6078b9a681fe 100644 --- a/tools/sched_ext/scx_example_flatcg.bpf.c +++ b/tools/sched_ext/scx_example_flatcg.bpf.c @@ -38,6 +38,10 @@ * this isn't a real concern especially given the performance gain. Also, there * are ways to mitigate the problem further by e.g. introducing an extra * scheduling layer on cgroup delegation boundaries. + * + * The scheduler first picks the cgroup to run and then schedule the tasks + * within by using nested weighted vtime scheduling by default. The + * cgroup-internal scheduling can be switched to FIFO with the -f option. */ #include "scx_common.bpf.h" #include "user_exit_info.h" @@ -47,6 +51,7 @@ char _license[] SEC("license") = "GPL"; const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; +const volatile bool fifo_sched; const volatile bool switch_partial; u64 cvtime_now; @@ -350,7 +355,21 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) if (!cgc) goto out_release; - scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + if (fifo_sched) { + scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + } else { + u64 tvtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. 
+ */ + if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) + tvtime = cgc->tvtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, + tvtime, enq_flags); + } cgrp_enqueued(cgrp, cgc); out_release: @@ -462,12 +481,40 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) bpf_cgroup_release(cgrp); } +void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) +{ + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + if (fifo_sched) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + /* + * @cgc->tvtime_now always progresses forward as tasks start + * executing. The test and update can be performed concurrently + * from multiple CPUs and thus racy. Any error should be + * contained and temporary. Let's just live with it. + */ + if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) + cgc->tvtime_now = p->scx.dsq_vtime; + } + bpf_cgroup_release(cgrp); +} + void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) { struct fcg_task_ctx *taskc; struct cgroup *cgrp; struct fcg_cgrp_ctx *cgc; + /* scale the execution time by the inverse of the weight and charge */ + if (!fifo_sched) + p->scx.dsq_vtime += + (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); if (!taskc) { scx_bpf_error("task_ctx lookup failed"); @@ -811,6 +858,7 @@ struct sched_ext_ops flatcg_ops = { .enqueue = (void *)fcg_enqueue, .dispatch = (void *)fcg_dispatch, .runnable = (void *)fcg_runnable, + .running = (void *)fcg_running, .stopping = (void *)fcg_stopping, .quiescent = (void *)fcg_quiescent, .prep_enable = (void *)fcg_prep_enable, diff --git a/tools/sched_ext/scx_example_flatcg.c b/tools/sched_ext/scx_example_flatcg.c index 150f7e16996e6..f9c8a5b84a703 100644 --- a/tools/sched_ext/scx_example_flatcg.c +++ b/tools/sched_ext/scx_example_flatcg.c @@ -26,10 +26,11 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-i INTERVAL] [-p]\n" +"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-p]\n" "\n" " -s SLICE_US Override slice duration\n" " -i INTERVAL Report interval\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -h Display this help and exit\n"; @@ -149,6 +150,9 @@ int main(int argc, char **argv) case 'd': dump_cgrps = true; break; + case 'f': + skel->rodata->fifo_sched = true; + break; case 'p': skel->rodata->switch_partial = true; break; diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_example_simple.bpf.c index fa5ae683ace1e..4bccca3e20470 100644 --- a/tools/sched_ext/scx_example_simple.bpf.c +++ b/tools/sched_ext/scx_example_simple.bpf.c @@ -2,11 +2,20 @@ /* * A simple scheduler. * - * A simple global FIFO scheduler. It also demonstrates the following niceties. + * By default, it operates as a simple global weighted vtime scheduler and can + * be switched to FIFO scheduling. It also demonstrates the following niceties. * * - Statistics tracking how many tasks are queued to local and global dsq's. * - Termination notification for userspace. * + * While very simple, this scheduler should work reasonably well on CPUs with a + * uniform L3 cache topology. While preemption is not implemented, the fact that + * the scheduling queue is shared across all CPUs means that whatever is at the + * front of the queue is likely to be executed fairly quickly given enough + * number of CPUs. 
The FIFO scheduling mode may be beneficial to some workloads + * but comes with the usual problems with FIFO scheduling where saturating + * threads can easily drown out interactive ones. + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet @@ -15,8 +24,10 @@ char _license[] SEC("license") = "GPL"; +const volatile bool fifo_sched; const volatile bool switch_partial; +static u64 vtime_now; struct user_exit_info uei; struct { @@ -33,8 +44,18 @@ static void stat_inc(u32 idx) (*cnt_p)++; } +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) { + /* + * If scx_select_cpu_dfl() is setting %SCX_ENQ_LOCAL, it indicates that + * running @p on its CPU directly shouldn't affect fairness. Just queue + * it on the local FIFO. + */ if (enq_flags & SCX_ENQ_LOCAL) { stat_inc(0); /* count local queueing */ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); @@ -42,7 +63,46 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) } stat_inc(1); /* count global queueing */ - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + + if (fifo_sched) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_running, struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. 
+ */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* scale the execution time by the inverse of the weight and charge */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; } s32 BPF_STRUCT_OPS(simple_init) @@ -60,6 +120,8 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SEC(".struct_ops") struct sched_ext_ops simple_ops = { .enqueue = (void *)simple_enqueue, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, .init = (void *)simple_init, .exit = (void *)simple_exit, .name = "simple", diff --git a/tools/sched_ext/scx_example_simple.c b/tools/sched_ext/scx_example_simple.c index 868fd39e45c7e..486b401f7c951 100644 --- a/tools/sched_ext/scx_example_simple.c +++ b/tools/sched_ext/scx_example_simple.c @@ -19,8 +19,9 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-p]\n" +"Usage: %s [-f] [-p]\n" "\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -h Display this help and exit\n"; @@ -65,8 +66,11 @@ int main(int argc, char **argv) skel = scx_example_simple__open(); assert(skel); - while ((opt = getopt(argc, argv, "ph")) != -1) { + while ((opt = getopt(argc, argv, "fph")) != -1) { switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; case 'p': skel->rodata->switch_partial = true; break; From d2ddca21284491eb8649370a1a4f7b08f839ad84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Apr 2023 06:40:08 -1000 Subject: [PATCH 030/304] sched_ext: Documentation: scheduler: Document extensible scheduler class Add Documentation/scheduler/sched-ext.rst which gives a high-level overview and pointers to the examples. v2: Apply minor edits suggested by Bagas. Caveats section dropped as all of them are addressed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Bagas Sanjaya --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-ext.rst | 230 ++++++++++++++++++++++++++ include/linux/sched/ext.h | 2 + kernel/Kconfig.preempt | 2 + kernel/sched/ext.c | 2 + kernel/sched/ext.h | 2 + 6 files changed, 239 insertions(+) create mode 100644 Documentation/scheduler/sched-ext.rst diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 3170747226f6d..0b650bb550e64 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -19,6 +19,7 @@ Scheduler sched-nice-design sched-rt-group sched-stats + sched-ext sched-debug text_files diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst new file mode 100644 index 0000000000000..84c30b44f104c --- /dev/null +++ b/Documentation/scheduler/sched-ext.rst @@ -0,0 +1,230 @@ +========================== +Extensible Scheduler Class +========================== + +sched_ext is a scheduler class whose behavior can be defined by a set of BPF +programs - the BPF scheduler. + +* sched_ext exports a full scheduling interface so that any scheduling + algorithm can be implemented on top. + +* The BPF scheduler can group CPUs however it sees fit and schedule them + together, as tasks aren't tied to specific CPUs at the time of wakeup. + +* The BPF scheduler can be turned on and off dynamically anytime. 
+ +* The system integrity is maintained no matter what the BPF scheduler does. + The default scheduling behavior is restored anytime an error is detected, + a runnable task stalls, or on invoking the SysRq key sequence + :kbd:`SysRq-S`. + +Switching to and from sched_ext +=============================== + +``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and +``tools/sched_ext`` contains the example schedulers. + +sched_ext is used only when the BPF scheduler is loaded and running. + +If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be +treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is +loaded. On load, such tasks will be switched to and scheduled by sched_ext. + +The BPF scheduler can choose to schedule all normal and lower class tasks by +calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this +case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and +``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers, +this mode can be selected with the ``-a`` option. + +Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or +detection of any internal error including stalled runnable tasks aborts the +BPF scheduler and reverts all tasks back to CFS. + +.. code-block:: none + + # make -j16 -C tools/sched_ext + # tools/sched_ext/scx_example_simple + local=0 global=3 + local=5 global=24 + local=9 global=44 + local=13 global=56 + local=17 global=72 + ^CEXIT: BPF scheduler unregistered + +If ``CONFIG_SCHED_DEBUG`` is set, the current status of the BPF scheduler +and whether a given task is on sched_ext can be determined as follows: + +.. code-block:: none + + # cat /sys/kernel/debug/sched/ext + ops : simple + enabled : 1 + switching_all : 1 + switched_all : 1 + enable_state : enabled + + # grep ext /proc/self/sched + ext.enabled : 1 + +The Basics +========== + +Userspace can implement an arbitrary BPF scheduler by loading a set of BPF +programs that implement ``struct sched_ext_ops``. The only mandatory field +is ``ops.name`` which must be a valid BPF object name. All operations are +optional. The following modified excerpt is from +``tools/sched/scx_example_simple.bpf.c`` showing a minimal global FIFO +scheduler. + +.. code-block:: c + + s32 BPF_STRUCT_OPS(simple_init) + { + if (!switch_partial) + scx_bpf_switch_all(); + return 0; + } + + void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) + { + if (enq_flags & SCX_ENQ_LOCAL) + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, enq_flags); + else + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, enq_flags); + } + + void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) + { + exit_type = ei->type; + } + + SEC(".struct_ops") + struct sched_ext_ops simple_ops = { + .enqueue = (void *)simple_enqueue, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple", + }; + +Dispatch Queues +--------------- + +To match the impedance between the scheduler core and the BPF scheduler, +sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a +priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), +and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage +an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and +``scx_bpf_destroy_dsq()``. + +A CPU always executes a task from its local DSQ. A task is "dispatched" to a +DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's +local DSQ. 
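+
+As a rough sketch (not an excerpt from the example schedulers), the snippet
+below shows one way a scheduler could funnel every task through a single
+custom FIFO DSQ. ``MY_DSQ_ID`` and the ``sketch_*`` names are placeholders,
+and, following the atropos example, ``ops.init()`` is assumed to be sleepable
+so that the DSQ can be created from it:
+
+.. code-block:: c
+
+   #define MY_DSQ_ID 0	/* placeholder custom DSQ ID, must be < 2^63 */
+
+   s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
+   {
+           /* custom DSQs must exist before tasks are dispatched to them */
+           return scx_bpf_create_dsq(MY_DSQ_ID, -1 /* as in the atropos example */);
+   }
+
+   void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
+   {
+           /* queue every runnable task on the custom DSQ in FIFO order */
+           scx_bpf_dispatch(p, MY_DSQ_ID, SCX_SLICE_DFL, enq_flags);
+   }
+
+   void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
+   {
+           /* refill this CPU's local DSQ from the custom DSQ */
+           scx_bpf_consume(MY_DSQ_ID);
+   }
+
+A real scheduler would typically still use direct dispatch to the local DSQs
+where appropriate, as described in the scheduling cycle below.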
+ +When a CPU is looking for the next task to run, if the local DSQ is not +empty, the first task is picked. Otherwise, the CPU tries to consume the +global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` +is invoked. + +Scheduling Cycle +---------------- + +The following briefly shows how a waking task is scheduled and executed. + +1. When a task is waking up, ``ops.select_cpu()`` is the first operation + invoked. This serves two purposes. First, CPU selection optimization + hint. Second, waking up the selected CPU if idle. + + The CPU selected by ``ops.select_cpu()`` is an optimization hint and not + binding. The actual decision is made at the last step of scheduling. + However, there is a small performance gain if the CPU + ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. + + A side-effect of selecting a CPU is waking it up from idle. While a BPF + scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, + using ``ops.select_cpu()`` judiciously can be simpler and more efficient. + + Note that the scheduler core will ignore an invalid CPU selection, for + example, if it's outside the allowed cpumask of the task. + +2. Once the target CPU is selected, ``ops.enqueue()`` is invoked. It can + make one of the following decisions: + + * Immediately dispatch the task to either the global or local DSQ by + calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or + ``SCX_DSQ_LOCAL``, respectively. + + * Immediately dispatch the task to a custom DSQ by calling + ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. + + * Queue the task on the BPF side. + +3. When a CPU is ready to schedule, it first looks at its local DSQ. If + empty, it then looks at the global DSQ. If there still isn't a task to + run, ``ops.dispatch()`` is invoked which can use the following two + functions to populate the local DSQ. + + * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can + be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, + ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` + currently can't be called with BPF locks held, this is being worked on + and will be supported. ``scx_bpf_dispatch()`` schedules dispatching + rather than performing them immediately. There can be up to + ``ops.dispatch_max_batch`` pending tasks. + + * ``scx_bpf_consume()`` tranfers a task from the specified non-local DSQ + to the dispatching DSQ. This function cannot be called with any BPF + locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks + before trying to consume the specified DSQ. + +4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, + the CPU runs the first one. If empty, the following steps are taken: + + * Try to consume the global DSQ. If successful, run the task. + + * If ``ops.dispatch()`` has dispatched any tasks, retry #3. + + * If the previous task is an SCX task and still runnable, keep executing + it (see ``SCX_OPS_ENQ_LAST``). + + * Go idle. + +Note that the BPF scheduler can always choose to dispatch tasks immediately +in ``ops.enqueue()`` as illustrated in the above simple example. If only the +built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as +a task is never queued on the BPF scheduler and both the local and global +DSQs are consumed automatically. + +``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use +``scx_bpf_dispatch_vtime()`` for the priority queue. 
See the function +documentation and usage in ``tools/sched_ext/scx_example_simple.bpf.c`` for +more information. + +Where to Look +============= + +* ``include/linux/sched/ext.h`` defines the core data structures, ops table + and constants. + +* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. + The functions prefixed with ``scx_bpf_`` can be called from the BPF + scheduler. + +* ``tools/sched_ext/`` hosts example BPF scheduler implementations. + + * ``scx_example_simple[.bpf].c``: Minimal global FIFO scheduler example + using a custom DSQ. + + * ``scx_example_qmap[.bpf].c``: A multi-level FIFO scheduler supporting + five levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. + +ABI Instability +=============== + +The APIs provided by sched_ext to BPF schedulers programs have no stability +guarantees. This includes the ops table callbacks and constants defined in +``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in +``kernel/sched/ext.c``. + +While we will attempt to provide a relatively stable API surface when +possible, they are subject to change without warning between kernel +versions. diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index fe2b051230b28..61837aac8ab3e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index e12a057ead7b6..bae49b743834b 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -154,3 +154,5 @@ config SCHED_CLASS_EXT wish to implement scheduling policies. The struct_ops structure exported by sched_ext is struct sched_ext_ops, and is conceptually similar to struct sched_class. + + See Documentation/scheduler/sched-ext.rst for more details. diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3515a3fba2837..5d115c047992a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index b5a31fae21689..998b790b39288 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet From 4d289aa3157487d988d7895a3c9747a7b086bf99 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 13 Apr 2023 06:40:08 -1000 Subject: [PATCH 031/304] sched_ext: Add a basic, userland vruntime scheduler This patch adds a new scx_example_userland BPF scheduler that implements a fairly unsophisticated sorted-list vruntime scheduler in userland to demonstrate how most scheduling decisions can be delegated to userland. The scheduler doesn't implement load balancing, and treats all tasks as part of a single domain. v2: * Converted to BPF inline iterators. 
Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo --- tools/sched_ext/.gitignore | 1 + tools/sched_ext/Makefile | 9 +- tools/sched_ext/scx_example_userland.bpf.c | 269 ++++++++++++ tools/sched_ext/scx_example_userland.c | 402 ++++++++++++++++++ tools/sched_ext/scx_example_userland_common.h | 19 + 5 files changed, 698 insertions(+), 2 deletions(-) create mode 100644 tools/sched_ext/scx_example_userland.bpf.c create mode 100644 tools/sched_ext/scx_example_userland.c create mode 100644 tools/sched_ext/scx_example_userland_common.h diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index 769bc6f35cc64..a3240f9f7ebae 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -3,6 +3,7 @@ scx_example_qmap scx_example_central scx_example_pair scx_example_flatcg +scx_example_userland *.skel.h *.subskel.h /tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 8c7543bbff8d7..71b5809243e30 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -116,7 +116,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -O2 -mcpu=v3 all: scx_example_simple scx_example_qmap scx_example_central scx_example_pair \ - scx_example_flatcg + scx_example_flatcg scx_example_userland # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -187,11 +187,16 @@ scx_example_flatcg: scx_example_flatcg.c scx_example_flatcg.skel.h user_exit_inf $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +scx_example_userland: scx_example_userland.c scx_example_userland.skel.h \ + scx_example_userland_common.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + clean: rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_example_simple scx_example_qmap scx_example_central \ - scx_example_pair scx_example_flatcg + scx_example_pair scx_example_flatcg scx_example_userland .PHONY: all clean diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_example_userland.bpf.c new file mode 100644 index 0000000000000..a089bc6bbe868 --- /dev/null +++ b/tools/sched_ext/scx_example_userland.bpf.c @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal userland scheduler. + * + * In terms of scheduling, this provides two different types of behaviors: + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. + * All such tasks are direct-dispatched from the kernel, and are never + * enqueued in user space. + * 2. A primitive vruntime scheduler that is implemented in user space, for all + * other tasks. + * + * Some parts of this example user space scheduler could be implemented more + * efficiently using more complex and sophisticated data structures. For + * example, rather than using BPF_MAP_TYPE_QUEUE's, + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between + * user space and kernel space. Similarly, we use a simple vruntime-sorted list + * in user space, but an rbtree could be used instead. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_common.bpf.h" +#include "scx_example_userland_common.h" + +char _license[] SEC("license") = "GPL"; + +const volatile bool switch_partial; +const volatile s32 usersched_pid; + +/* !0 for veristat, set during init */ +const volatile u32 num_possible_cpus = 64; + +/* Stats that are printed by user space. */ +u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; + +struct user_exit_info uei; + +/* + * Whether the user space scheduler needs to be scheduled due to a task being + * enqueued in user space. + */ +static bool usersched_needed; + +/* + * The map containing tasks that are enqueued in user space from the kernel. + * + * This map is drained by the user space scheduler. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, USERLAND_MAX_TASKS); + __type(value, struct scx_userland_enqueued_task); +} enqueued SEC(".maps"); + +/* + * The map containing tasks that are dispatched to the kernel from user space. + * + * Drained by the kernel in userland_dispatch(). + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, USERLAND_MAX_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = bpf_task_from_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. + */ + if (!p) { + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + /* + * We should never hit this path, and we error out of the + * scheduler above just in case, so the scheduler will soon be + * be evicted regardless. So as to simplify the logic in the + * caller to not have to check for NULL, return an acquired + * reference to the current task here rather than NULL. 
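+ * The only caller, dispatch_user_scheduler(), unconditionally releases the
+ * returned pointer with bpf_task_release(), so the reference acquired on
+ * this error path stays balanced as well.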
+ */ + return bpf_task_acquire(bpf_get_current_task_btf()); + } + + return p; +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + + usersched_needed = false; + p = usersched_task(); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + struct scx_userland_enqueued_task task; + + memset(&task, 0, sizeof(task)); + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. + */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + __sync_fetch_and_add(&nr_user_enqueues, 1); + usersched_needed = true; + } +} + +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} + +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + if (usersched_needed) + dispatch_user_scheduler(); + + bpf_repeat(4096) { + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + break; + + /* + * The task could have exited by the time we get around to + * dispatching it. Treat this as a normal occurrence, and simply + * move onto the next iteration. 
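+ * bpf_task_from_pid() returns NULL in that case and the stale pid is simply
+ * skipped.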
+ */ + p = bpf_task_from_pid(pid); + if (!p) + continue; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops userland_ops = { + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .prep_enable = (void *)userland_prep_enable, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .timeout_ms = 3000, + .name = "userland", +}; diff --git a/tools/sched_ext/scx_example_userland.c b/tools/sched_ext/scx_example_userland.c new file mode 100644 index 0000000000000..4152b1e65fe1a --- /dev/null +++ b/tools/sched_ext/scx_example_userland.c @@ -0,0 +1,402 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. + * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "user_exit_info.h" +#include "scx_example_userland_common.h" +#include "scx_example_userland.skel.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-b BATCH] [-p]\n" +"\n" +" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" +" -p Don't switch all, switch only tasks on SCHED_EXT policy\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. */ +static __u32 batch_size = 8; + +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_example_userland *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; + +/* The data structure containing tasks that are enqueued in user space. */ +struct enqueued_task { + LIST_ENTRY(enqueued_task) entries; + __u64 sum_exec_runtime; + double vruntime; +}; + +/* + * Use a vruntime-sorted list to store tasks. This could easily be extended to + * a more optimal data structure, such as an rbtree as is done in CFS. We + * currently elect to use a sorted list to simplify the example for + * illustrative purposes. 
+ */ +LIST_HEAD(listhead, enqueued_task); + +/* + * A vruntime-sorted list of tasks. The head of the list contains the task with + * the lowest vruntime. That is, the task that has the "highest" claim to be + * scheduled. + */ +static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); + +/* + * The statically allocated array of tasks. We use a statically allocated list + * here to avoid having to allocate on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. + */ +struct enqueued_task tasks[USERLAND_MAX_TASKS]; + +static double min_vruntime; + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + fprintf(stderr, "Failed to dispatch task %d\n", pid); + exit_req = 1; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= USERLAND_MAX_TASKS) + return NULL; + + return &tasks[pid]; +} + +static double calc_vruntime_delta(__u64 weight, __u64 delta) +{ + double weight_f = (double)weight / 100.0; + double delta_f = (double)delta; + + return delta_f / weight_f; +} + +static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) +{ + __u64 delta; + + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; + + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); + if (min_vruntime > enqueued->vruntime) + enqueued->vruntime = min_vruntime; + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; +} + +static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr, *enqueued, *prev; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) + return ENOENT; + + update_enqueued(curr, bpf_task); + nr_vruntime_enqueues++; + + /* + * Enqueue the task in a vruntime-sorted list. A more optimal data + * structure such as an rbtree could easily be used as well. We elect + * to use a list here simply because it's less code, and thus the + * example is less convoluted and better serves to illustrate what a + * user space scheduler could look like. 
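+ * Note that the sorted insertion below is O(n) in the number of tasks already
+ * enqueued in user space; an rbtree would bring this down to O(log n).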
+ */ + + if (LIST_EMPTY(&vruntime_head)) { + LIST_INSERT_HEAD(&vruntime_head, curr, entries); + return 0; + } + + LIST_FOREACH(enqueued, &vruntime_head, entries) { + if (curr->vruntime <= enqueued->vruntime) { + LIST_INSERT_BEFORE(enqueued, curr, entries); + return 0; + } + prev = enqueued; + } + + LIST_INSERT_AFTER(prev, curr, entries); + + return 0; +} + +static void drain_enqueued_map(void) +{ + while (1) { + struct scx_userland_enqueued_task task; + int err; + + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) + return; + + err = vruntime_enqueue(&task); + if (err) { + fprintf(stderr, "Failed to enqueue task %d: %s\n", + task.pid, strerror(err)); + exit_req = 1; + return; + } + } +} + +static void dispatch_batch(void) +{ + __u32 i; + + for (i = 0; i < batch_size; i++) { + struct enqueued_task *task; + int err; + __s32 pid; + + task = LIST_FIRST(&vruntime_head); + if (!task) + return; + + min_vruntime = task->vruntime; + pid = task_pid(task); + LIST_REMOVE(task, entries); + err = dispatch_task(pid); + if (err) { + fprintf(stderr, "Failed to dispatch task %d in %u\n", + pid, i); + return; + } + } +} + +static void *run_stats_printer(void *arg) +{ + while (!exit_req) { + __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; + + nr_failed_enqueues = skel->bss->nr_failed_enqueues; + nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; + nr_user_enqueues = skel->bss->nr_user_enqueues; + total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; + + printf("o-----------------------o\n"); + printf("| BPF ENQUEUES |\n"); + printf("|-----------------------|\n"); + printf("| kern: %10llu |\n", nr_kernel_enqueues); + printf("| user: %10llu |\n", nr_user_enqueues); + printf("| failed: %10llu |\n", nr_failed_enqueues); + printf("| -------------------- |\n"); + printf("| total: %10llu |\n", total); + printf("| |\n"); + printf("|-----------------------|\n"); + printf("| VRUNTIME / USER |\n"); + printf("|-----------------------|\n"); + printf("| enq: %10llu |\n", nr_vruntime_enqueues); + printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("o-----------------------o\n"); + printf("\n\n"); + sleep(1); + } + + return NULL; +} + +static int spawn_stats_thread(void) +{ + pthread_t stats_printer; + + return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); +} + +static int bootstrap(int argc, char **argv) +{ + int err; + __u32 opt; + struct sched_param sched_param = { + .sched_priority = sched_get_priority_max(SCHED_EXT), + }; + bool switch_partial = false; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + /* + * Enforce that the user scheduler task is managed by sched_ext. The + * task eagerly drains the list of enqueued tasks in its main work + * loop, and then yields the CPU. The BPF scheduler only schedules the + * user space scheduler task when at least one other task in the system + * needs to be scheduled. 
+ */ + err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); + if (err) { + fprintf(stderr, "Failed to set scheduler to SCHED_EXT: %s\n", strerror(err)); + return err; + } + + while ((opt = getopt(argc, argv, "b:ph")) != -1) { + switch (opt) { + case 'b': + batch_size = strtoul(optarg, NULL, 0); + break; + case 'p': + switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + exit(opt != 'h'); + } + } + + /* + * It's not always safe to allocate in a user space scheduler, as an + * enqueued task could hold a lock that we require in order to be able + * to allocate. + */ + err = mlockall(MCL_CURRENT | MCL_FUTURE); + if (err) { + fprintf(stderr, "Failed to prefault and lock address space: %s\n", + strerror(err)); + return err; + } + + skel = scx_example_userland__open(); + if (!skel) { + fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno)); + return errno; + } + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->num_possible_cpus > 0); + skel->rodata->usersched_pid = getpid(); + assert(skel->rodata->usersched_pid > 0); + skel->rodata->switch_partial = switch_partial; + + err = scx_example_userland__load(skel); + if (err) { + fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err)); + goto destroy_skel; + } + + enqueued_fd = bpf_map__fd(skel->maps.enqueued); + dispatched_fd = bpf_map__fd(skel->maps.dispatched); + assert(enqueued_fd > 0); + assert(dispatched_fd > 0); + + err = spawn_stats_thread(); + if (err) { + fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err)); + goto destroy_skel; + } + + ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops); + if (!ops_link) { + fprintf(stderr, "Failed to attach struct ops: %s\n", strerror(errno)); + err = errno; + goto destroy_skel; + } + + return 0; + +destroy_skel: + scx_example_userland__destroy(skel); + exit_req = 1; + return err; +} + +static void sched_main_loop(void) +{ + while (!exit_req) { + /* + * Perform the following work in the main user space scheduler + * loop: + * + * 1. Drain all tasks from the enqueued map, and enqueue them + * to the vruntime sorted list. + * + * 2. Dispatch a batch of tasks from the vruntime sorted list + * down to the kernel. + * + * 3. Yield the CPU back to the system. The BPF scheduler will + * reschedule the user space scheduler once another task has + * been enqueued to user space. + */ + drain_enqueued_map(); + dispatch_batch(); + sched_yield(); + } +} + +int main(int argc, char **argv) +{ + int err; + + err = bootstrap(argc, argv); + if (err) { + fprintf(stderr, "Failed to bootstrap scheduler: %s\n", strerror(err)); + return err; + } + + sched_main_loop(); + + exit_req = 1; + bpf_link__destroy(ops_link); + uei_print(&skel->bss->uei); + scx_example_userland__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_example_userland_common.h b/tools/sched_ext/scx_example_userland_common.h new file mode 100644 index 0000000000000..639c6809c5ffe --- /dev/null +++ b/tools/sched_ext/scx_example_userland_common.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta, Inc */ + +#ifndef __SCX_USERLAND_COMMON_H +#define __SCX_USERLAND_COMMON_H + +#define USERLAND_MAX_TASKS 8192 + +/* + * An instance of a task that has been enqueued by the kernel for consumption + * by a user space global scheduler thread. 
+ */ +struct scx_userland_enqueued_task { + __s32 pid; + u64 sum_exec_runtime; + u64 weight; +}; + +#endif // __SCX_USERLAND_COMMON_H From af73020854c08cd7518a676c90c8cbe6d828a43c Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Thu, 13 Apr 2023 06:40:08 -1000 Subject: [PATCH 032/304] sched_ext: Add a rust userspace hybrid example scheduler Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF part does simple round robin in each domain and the userspace part calculates the load factor of each domain and tells the BPF part how to load balance the domains. This scheduler demonstrates dividing scheduling logic between BPF and userspace and using rust to build the userspace part. An earlier variant of this scheduler was used to balance across six domains, each representing a chiplet in a six-chiplet AMD processor, and could match the performance of production setup using CFS. v3: * The userspace code is substantially restructured and rewritten. The binary is renamed to scx_atropos and can now figure out the domain topology automatically based on L3 cache configuration. The LB logic which was rather broken in the previous postings are revamped and should behave better. * Updated to support weighted vtime scheduling (can be turned off with --fifo-sched). Added a couple options (--slice_us, --kthreads-local) to modify scheduling behaviors. * Converted to use BPF inline iterators. v2: * Updated to use generic BPF cpumask helpers. Signed-off-by: Dan Schatzberg Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 13 +- tools/sched_ext/atropos/.gitignore | 3 + tools/sched_ext/atropos/Cargo.toml | 28 + tools/sched_ext/atropos/build.rs | 70 ++ tools/sched_ext/atropos/rustfmt.toml | 8 + tools/sched_ext/atropos/src/atropos_sys.rs | 10 + tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 743 ++++++++++++++ tools/sched_ext/atropos/src/bpf/atropos.h | 44 + tools/sched_ext/atropos/src/main.rs | 942 ++++++++++++++++++ 9 files changed, 1859 insertions(+), 2 deletions(-) create mode 100644 tools/sched_ext/atropos/.gitignore create mode 100644 tools/sched_ext/atropos/Cargo.toml create mode 100644 tools/sched_ext/atropos/build.rs create mode 100644 tools/sched_ext/atropos/rustfmt.toml create mode 100644 tools/sched_ext/atropos/src/atropos_sys.rs create mode 100644 tools/sched_ext/atropos/src/bpf/atropos.bpf.c create mode 100644 tools/sched_ext/atropos/src/bpf/atropos.h create mode 100644 tools/sched_ext/atropos/src/main.rs diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 71b5809243e30..73c43782837d4 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -85,6 +85,8 @@ CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) +CARGOFLAGS := --release + # Silence some warnings when compiled with clang ifneq ($(LLVM),) CFLAGS += -Wno-unused-command-line-argument @@ -116,7 +118,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -O2 -mcpu=v3 all: scx_example_simple scx_example_qmap scx_example_central scx_example_pair \ - scx_example_flatcg scx_example_userland + scx_example_flatcg scx_example_userland atropos # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -192,13 +194,20 @@ scx_example_userland: scx_example_userland.c scx_example_userland.skel.h \ $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +atropos: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf 
-L $(BPFOBJ_DIR) +atropos: export ATROPOS_CLANG = $(CLANG) +atropos: export ATROPOS_BPF_CFLAGS = $(BPF_CFLAGS) +atropos: $(INCLUDE_DIR)/vmlinux.h + cargo build --manifest-path=atropos/Cargo.toml $(CARGOFLAGS) + clean: + cargo clean --manifest-path=atropos/Cargo.toml rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_example_simple scx_example_qmap scx_example_central \ scx_example_pair scx_example_flatcg scx_example_userland -.PHONY: all clean +.PHONY: all atropos clean # delete failed targets .DELETE_ON_ERROR: diff --git a/tools/sched_ext/atropos/.gitignore b/tools/sched_ext/atropos/.gitignore new file mode 100644 index 0000000000000..186dba259ec21 --- /dev/null +++ b/tools/sched_ext/atropos/.gitignore @@ -0,0 +1,3 @@ +src/bpf/.output +Cargo.lock +target diff --git a/tools/sched_ext/atropos/Cargo.toml b/tools/sched_ext/atropos/Cargo.toml new file mode 100644 index 0000000000000..7462a836d53dd --- /dev/null +++ b/tools/sched_ext/atropos/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "scx_atropos" +version = "0.5.0" +authors = ["Dan Schatzberg ", "Meta"] +edition = "2021" +description = "Userspace scheduling with BPF" +license = "GPL-2.0-only" + +[dependencies] +anyhow = "1.0.65" +bitvec = { version = "1.0", features = ["serde"] } +clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } +ctrlc = { version = "3.1", features = ["termination"] } +fb_procfs = { git = "https://github.com/facebookincubator/below.git", rev = "f305730"} +hex = "0.4.3" +libbpf-rs = "0.19.1" +libbpf-sys = { version = "1.0.4", features = ["novendor", "static"] } +libc = "0.2.137" +log = "0.4.17" +ordered-float = "3.4.0" +simplelog = "0.12.0" + +[build-dependencies] +bindgen = { version = "0.61.0", features = ["logging", "static"], default-features = false } +libbpf-cargo = "0.13.0" + +[features] +enable_backtrace = [] diff --git a/tools/sched_ext/atropos/build.rs b/tools/sched_ext/atropos/build.rs new file mode 100644 index 0000000000000..26e792c5e17e9 --- /dev/null +++ b/tools/sched_ext/atropos/build.rs @@ -0,0 +1,70 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +extern crate bindgen; + +use std::env; +use std::fs::create_dir_all; +use std::path::Path; +use std::path::PathBuf; + +use libbpf_cargo::SkeletonBuilder; + +const HEADER_PATH: &str = "src/bpf/atropos.h"; + +fn bindgen_atropos() { + // Tell cargo to invalidate the built crate whenever the wrapper changes + println!("cargo:rerun-if-changed={}", HEADER_PATH); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header(HEADER_PATH) + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. 
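+    // In this crate the generated file is actually written out as
+    // atropos-sys.rs (see below) and pulled in by src/atropos_sys.rs.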
+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("atropos-sys.rs")) + .expect("Couldn't write bindings!"); +} + +fn gen_bpf_sched(name: &str) { + let bpf_cflags = env::var("ATROPOS_BPF_CFLAGS").unwrap(); + let clang = env::var("ATROPOS_CLANG").unwrap(); + eprintln!("{}", clang); + let outpath = format!("./src/bpf/.output/{}.skel.rs", name); + let skel = Path::new(&outpath); + let src = format!("./src/bpf/{}.bpf.c", name); + SkeletonBuilder::new() + .source(src.clone()) + .clang(clang) + .clang_args(bpf_cflags) + .build_and_generate(&skel) + .unwrap(); + println!("cargo:rerun-if-changed={}", src); +} + +fn main() { + bindgen_atropos(); + // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. + // Reasons are because the generated skeleton contains compiler attributes + // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` + // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside + // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). + // + // However, there is hope! When the above feature stabilizes we can clean this + // all up. + create_dir_all("./src/bpf/.output").unwrap(); + gen_bpf_sched("atropos"); +} diff --git a/tools/sched_ext/atropos/rustfmt.toml b/tools/sched_ext/atropos/rustfmt.toml new file mode 100644 index 0000000000000..b7258ed0a8d84 --- /dev/null +++ b/tools/sched_ext/atropos/rustfmt.toml @@ -0,0 +1,8 @@ +# Get help on options with `rustfmt --help=config` +# Please keep these in alphabetical order. +edition = "2021" +group_imports = "StdExternalCrate" +imports_granularity = "Item" +merge_derives = false +use_field_init_shorthand = true +version = "Two" diff --git a/tools/sched_ext/atropos/src/atropos_sys.rs b/tools/sched_ext/atropos/src/atropos_sys.rs new file mode 100644 index 0000000000000..bbeaf856d40e8 --- /dev/null +++ b/tools/sched_ext/atropos/src/atropos_sys.rs @@ -0,0 +1,10 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/atropos-sys.rs")); diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c new file mode 100644 index 0000000000000..3905a403e9406 --- /dev/null +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -0,0 +1,743 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +/* + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + * + * Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF + * part does simple round robin in each domain and the userspace part + * calculates the load factor of each domain and tells the BPF part how to load + * balance the domains. + * + * Every task has an entry in the task_data map which lists which domain the + * task belongs to. When a task first enters the system (atropos_prep_enable), + * they are round-robined to a domain. + * + * atropos_select_cpu is the primary scheduling logic, invoked when a task + * becomes runnable. The lb_data map is populated by userspace to inform the BPF + * scheduler that a task should be migrated to a new domain. 
Otherwise, the task + * is scheduled in priority order as follows: + * * The current core if the task was woken up synchronously and there are idle + * cpus in the system + * * The previous core, if idle + * * The pinned-to core if the task is pinned to a specific core + * * Any idle cpu in the domain + * + * If none of the above conditions are met, then the task is enqueued to a + * dispatch queue corresponding to the domain (atropos_enqueue). + * + * atropos_dispatch will attempt to consume a task from its domain's + * corresponding dispatch queue (this occurs after scheduling any tasks directly + * assigned to it due to the logic in atropos_select_cpu). If no task is found, + * then greedy load stealing will attempt to find a task on another dispatch + * queue to run. + * + * Load balancing is almost entirely handled by userspace. BPF populates the + * task weight, dom mask and current dom in the task_data map and executes the + * load balance based on userspace populating the lb_data map. + */ +#include "../../../scx_common.bpf.h" +#include "atropos.h" + +#include +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* + * const volatiles are set during initialization and treated as consts by the + * jit compiler. + */ + +/* + * Domains and cpus + */ +const volatile __u32 nr_doms = 32; /* !0 for veristat, set during init */ +const volatile __u32 nr_cpus = 64; /* !0 for veristat, set during init */ +const volatile __u32 cpu_dom_id_map[MAX_CPUS]; +const volatile __u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; + +const volatile bool kthreads_local; +const volatile bool fifo_sched; +const volatile bool switch_partial; +const volatile __u32 greedy_threshold; + +/* base slice duration */ +const volatile __u64 slice_us = 20000; + +/* + * Exit info + */ +int exit_type = SCX_EXIT_NONE; +char exit_msg[SCX_EXIT_MSG_LEN]; + +struct pcpu_ctx { + __u32 dom_rr_cur; /* used when scanning other doms */ + + /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */ + __u8 _padding[CACHELINE_SIZE - sizeof(u64)]; +} __attribute__((aligned(CACHELINE_SIZE))); + +struct pcpu_ctx pcpu_ctx[MAX_CPUS]; + +/* + * Domain context + */ +struct dom_ctx { + struct bpf_cpumask __kptr *cpumask; + u64 vtime_now; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct dom_ctx); + __uint(max_entries, MAX_DOMS); + __uint(map_flags, 0); +} dom_ctx SEC(".maps"); + +/* + * Statistics + */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, ATROPOS_NR_STATS); +} stats SEC(".maps"); + +static inline void stat_add(enum stat_idx idx, u64 addend) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p) += addend; +} + +/* Map pid -> task_ctx */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, struct task_ctx); + __uint(max_entries, 1000000); + __uint(map_flags, 0); +} task_data SEC(".maps"); + +/* + * This is populated from userspace to indicate which pids should be reassigned + * to new doms. 
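+ * The value is the target domain ID. atropos_enqueue() consults this map on
+ * every enqueue and migrates the task when the target differs from its
+ * current domain.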
+ */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, u32); + __uint(max_entries, 1000); + __uint(map_flags, 0); +} lb_data SEC(".maps"); + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static bool task_set_dsq(struct task_ctx *task_ctx, struct task_struct *p, + u32 new_dom_id) +{ + struct dom_ctx *old_domc, *new_domc; + struct bpf_cpumask *d_cpumask, *t_cpumask; + u32 old_dom_id = task_ctx->dom_id; + s64 vtime_delta; + + old_domc = bpf_map_lookup_elem(&dom_ctx, &old_dom_id); + if (!old_domc) { + scx_bpf_error("No dom%u", old_dom_id); + return false; + } + + vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now; + + new_domc = bpf_map_lookup_elem(&dom_ctx, &new_dom_id); + if (!new_domc) { + scx_bpf_error("No dom%u", new_dom_id); + return false; + } + + d_cpumask = new_domc->cpumask; + if (!d_cpumask) { + scx_bpf_error("Failed to get domain %u cpumask kptr", + new_dom_id); + return false; + } + + t_cpumask = task_ctx->cpumask; + if (!t_cpumask) { + scx_bpf_error("Failed to look up task cpumask"); + return false; + } + + /* + * set_cpumask might have happened between userspace requesting LB and + * here and @p might not be able to run in @dom_id anymore. Verify. + */ + if (bpf_cpumask_intersects((const struct cpumask *)d_cpumask, + p->cpus_ptr)) { + p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta; + task_ctx->dom_id = new_dom_id; + bpf_cpumask_and(t_cpumask, (const struct cpumask *)d_cpumask, + p->cpus_ptr); + } + + return task_ctx->dom_id == new_dom_id; +} + +s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, + u32 wake_flags) +{ + s32 cpu; + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + struct bpf_cpumask *p_cpumask; + + if (!task_ctx) + return -ENOENT; + + if (kthreads_local && + (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + cpu = prev_cpu; + stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + goto local; + } + + /* + * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the + * local dsq of the waker. 
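+ * "Not fully saturated" is approximated below by checking whether the task's
+ * domain still has an idle CPU.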
+ */ + if (p->nr_cpus_allowed > 1 && (wake_flags & SCX_WAKE_SYNC)) { + struct task_struct *current = (void *)bpf_get_current_task(); + + if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && + task_ctx->dom_id < MAX_DOMS) { + struct dom_ctx *domc; + struct bpf_cpumask *d_cpumask; + const struct cpumask *idle_cpumask; + bool has_idle; + + domc = bpf_map_lookup_elem(&dom_ctx, &task_ctx->dom_id); + if (!domc) { + scx_bpf_error("Failed to find dom%u", + task_ctx->dom_id); + return prev_cpu; + } + d_cpumask = domc->cpumask; + if (!d_cpumask) { + scx_bpf_error("Failed to acquire domain %u cpumask kptr", + task_ctx->dom_id); + return prev_cpu; + } + + idle_cpumask = scx_bpf_get_idle_cpumask(); + + has_idle = bpf_cpumask_intersects((const struct cpumask *)d_cpumask, + idle_cpumask); + + scx_bpf_put_idle_cpumask(idle_cpumask); + + if (has_idle) { + cpu = bpf_get_smp_processor_id(); + if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + stat_add(ATROPOS_STAT_WAKE_SYNC, 1); + goto local; + } + } + } + } + + /* if the previous CPU is idle, dispatch directly to it */ + if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + stat_add(ATROPOS_STAT_PREV_IDLE, 1); + cpu = prev_cpu; + goto local; + } + + /* If only one core is allowed, dispatch */ + if (p->nr_cpus_allowed == 1) { + stat_add(ATROPOS_STAT_PINNED, 1); + cpu = prev_cpu; + goto local; + } + + p_cpumask = task_ctx->cpumask; + if (!p_cpumask) + return -ENOENT; + + /* If there is an eligible idle CPU, dispatch directly */ + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + goto local; + } + + /* + * @prev_cpu may be in a different domain. Returning an out-of-domain + * CPU can lead to stalls as all in-domain CPUs may be idle by the time + * @p gets enqueued. + */ + if (bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *)p_cpumask)) + cpu = prev_cpu; + else + cpu = bpf_cpumask_any((const struct cpumask *)p_cpumask); + + return cpu; + +local: + task_ctx->dispatch_local = true; + return cpu; +} + +void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) +{ + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + u32 *new_dom; + + if (!task_ctx) { + scx_bpf_error("No task_ctx[%d]", pid); + return; + } + + new_dom = bpf_map_lookup_elem(&lb_data, &pid); + if (new_dom && *new_dom != task_ctx->dom_id && + task_set_dsq(task_ctx, p, *new_dom)) { + struct bpf_cpumask *p_cpumask; + s32 cpu; + + stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); + + /* + * If dispatch_local is set, We own @p's idle state but we are + * not gonna put the task in the associated local dsq which can + * cause the CPU to stall. Kick it. 
+ */ + if (task_ctx->dispatch_local) { + task_ctx->dispatch_local = false; + scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0); + } + + p_cpumask = task_ctx->cpumask; + if (!p_cpumask) { + scx_bpf_error("Failed to get task_ctx->cpumask"); + return; + } + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask); + + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, 0); + } + + if (task_ctx->dispatch_local) { + task_ctx->dispatch_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_us * 1000, enq_flags); + return; + } + + if (fifo_sched) { + scx_bpf_dispatch(p, task_ctx->dom_id, slice_us * 1000, + enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + u32 dom_id = task_ctx->dom_id; + struct dom_ctx *domc; + + domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); + if (!domc) { + scx_bpf_error("No dom[%u]", dom_id); + return; + } + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, domc->vtime_now - slice_us * 1000)) + vtime = domc->vtime_now - slice_us * 1000; + + scx_bpf_dispatch_vtime(p, task_ctx->dom_id, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +static u32 cpu_to_dom_id(s32 cpu) +{ + const volatile u32 *dom_idp; + + if (nr_doms <= 1) + return 0; + + dom_idp = MEMBER_VPTR(cpu_dom_id_map, [cpu]); + if (!dom_idp) + return MAX_DOMS; + + return *dom_idp; +} + +static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id) +{ + s32 cpu; + + if (dom_id >= MAX_DOMS) + return false; + + bpf_for(cpu, 0, nr_cpus) { + if (bpf_cpumask_test_cpu(cpu, cpumask) && + (dom_cpumasks[dom_id][cpu / 64] & (1LLU << (cpu % 64)))) + return true; + } + return false; +} + +static u32 dom_rr_next(s32 cpu) +{ + struct pcpu_ctx *pcpuc; + u32 dom_id; + + pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]); + if (!pcpuc) + return 0; + + dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms; + + if (dom_id == cpu_to_dom_id(cpu)) + dom_id = (dom_id + 1) % nr_doms; + + pcpuc->dom_rr_cur = dom_id; + return dom_id; +} + +void BPF_STRUCT_OPS(atropos_dispatch, s32 cpu, struct task_struct *prev) +{ + u32 dom = cpu_to_dom_id(cpu); + + if (scx_bpf_consume(dom)) { + stat_add(ATROPOS_STAT_DSQ_DISPATCH, 1); + return; + } + + if (!greedy_threshold) + return; + + bpf_repeat(nr_doms - 1) { + u32 dom_id = dom_rr_next(cpu); + + if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold && + scx_bpf_consume(dom_id)) { + stat_add(ATROPOS_STAT_GREEDY, 1); + break; + } + } +} + +void BPF_STRUCT_OPS(atropos_runnable, struct task_struct *p, u64 enq_flags) +{ + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + + if (!task_ctx) { + scx_bpf_error("No task_ctx[%d]", pid); + return; + } + + task_ctx->runnable_at = bpf_ktime_get_ns(); +} + +void BPF_STRUCT_OPS(atropos_running, struct task_struct *p) +{ + struct task_ctx *taskc; + struct dom_ctx *domc; + pid_t pid = p->pid; + u32 dom_id; + + if (fifo_sched) + return; + + taskc = bpf_map_lookup_elem(&task_data, &pid); + if (!taskc) { + scx_bpf_error("No task_ctx[%d]", pid); + return; + } + dom_id = taskc->dom_id; + + domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); + if (!domc) { + scx_bpf_error("No dom[%u]", dom_id); + return; + } + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. 
+ */ + if (vtime_before(domc->vtime_now, p->scx.dsq_vtime)) + domc->vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(atropos_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* scale the execution time by the inverse of the weight and charge */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(atropos_quiescent, struct task_struct *p, u64 deq_flags) +{ + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + + if (!task_ctx) { + scx_bpf_error("No task_ctx[%d]", pid); + return; + } + + task_ctx->runnable_for += bpf_ktime_get_ns() - task_ctx->runnable_at; + task_ctx->runnable_at = 0; +} + +void BPF_STRUCT_OPS(atropos_set_weight, struct task_struct *p, u32 weight) +{ + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + + if (!task_ctx) { + scx_bpf_error("No task_ctx[%d]", pid); + return; + } + + task_ctx->weight = weight; +} + +struct pick_task_domain_loop_ctx { + struct task_struct *p; + const struct cpumask *cpumask; + u64 dom_mask; + u32 dom_rr_base; + u32 dom_id; +}; + +static int pick_task_domain_loopfn(u32 idx, void *data) +{ + struct pick_task_domain_loop_ctx *lctx = data; + u32 dom_id = (lctx->dom_rr_base + idx) % nr_doms; + + if (dom_id >= MAX_DOMS) + return 1; + + if (cpumask_intersects_domain(lctx->cpumask, dom_id)) { + lctx->dom_mask |= 1LLU << dom_id; + if (lctx->dom_id == MAX_DOMS) + lctx->dom_id = dom_id; + } + return 0; +} + +static u32 pick_task_domain(struct task_ctx *task_ctx, struct task_struct *p, + const struct cpumask *cpumask) +{ + struct pick_task_domain_loop_ctx lctx = { + .p = p, + .cpumask = cpumask, + .dom_id = MAX_DOMS, + }; + s32 cpu = bpf_get_smp_processor_id(); + + if (cpu < 0 || cpu >= MAX_CPUS) + return MAX_DOMS; + + lctx.dom_rr_base = ++(pcpu_ctx[cpu].dom_rr_cur); + + bpf_loop(nr_doms, pick_task_domain_loopfn, &lctx, 0); + task_ctx->dom_mask = lctx.dom_mask; + + return lctx.dom_id; +} + +static void task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, + const struct cpumask *cpumask) +{ + u32 dom_id = 0; + + if (nr_doms > 1) + dom_id = pick_task_domain(task_ctx, p, cpumask); + + if (!task_set_dsq(task_ctx, p, dom_id)) + scx_bpf_error("Failed to set domain %d for %s[%d]", + dom_id, p->comm, p->pid); +} + +void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + if (!task_ctx) { + scx_bpf_error("No task_ctx[%d]", pid); + return; + } + + task_set_domain(task_ctx, p, cpumask); +} + +s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + struct bpf_cpumask *cpumask; + struct task_ctx task_ctx, *map_value; + long ret; + pid_t pid; + + memset(&task_ctx, 0, sizeof(task_ctx)); + + pid = p->pid; + ret = bpf_map_update_elem(&task_data, &pid, &task_ctx, BPF_NOEXIST); + if (ret) { + stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + return ret; + } + + /* + * Read the entry from the map immediately so we can add the cpumask + * with bpf_kptr_xchg(). + */ + map_value = bpf_map_lookup_elem(&task_data, &pid); + if (!map_value) + /* Should never happen -- it was just inserted above. 
*/ + return -EINVAL; + + cpumask = bpf_cpumask_create(); + if (!cpumask) { + bpf_map_delete_elem(&task_data, &pid); + return -ENOMEM; + } + + cpumask = bpf_kptr_xchg(&map_value->cpumask, cpumask); + if (cpumask) { + /* Should never happen as we just inserted it above. */ + bpf_cpumask_release(cpumask); + bpf_map_delete_elem(&task_data, &pid); + return -EINVAL; + } + + task_set_domain(map_value, p, p->cpus_ptr); + + return 0; +} + +void BPF_STRUCT_OPS(atropos_disable, struct task_struct *p) +{ + pid_t pid = p->pid; + long ret = bpf_map_delete_elem(&task_data, &pid); + if (ret) { + stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + return; + } +} + +static int create_dom_dsq(u32 idx, void *data) +{ + struct dom_ctx domc_init = {}, *domc; + struct bpf_cpumask *cpumask; + u32 cpu, dom_id = idx; + s32 ret; + + ret = scx_bpf_create_dsq(dom_id, -1); + if (ret < 0) { + scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret); + return 1; + } + + ret = bpf_map_update_elem(&dom_ctx, &dom_id, &domc_init, 0); + if (ret) { + scx_bpf_error("Failed to add dom_ctx entry %u (%d)", dom_id, ret); + return 1; + } + + domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); + if (!domc) { + /* Should never happen, we just inserted it above. */ + scx_bpf_error("No dom%u", dom_id); + return 1; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) { + scx_bpf_error("Failed to create BPF cpumask for domain %u", dom_id); + return 1; + } + + for (cpu = 0; cpu < MAX_CPUS; cpu++) { + const volatile __u64 *dmask; + + dmask = MEMBER_VPTR(dom_cpumasks, [dom_id][cpu / 64]); + if (!dmask) { + scx_bpf_error("array index error"); + bpf_cpumask_release(cpumask); + return 1; + } + + if (*dmask & (1LLU << (cpu % 64))) + bpf_cpumask_set_cpu(cpu, cpumask); + } + + cpumask = bpf_kptr_xchg(&domc->cpumask, cpumask); + if (cpumask) { + scx_bpf_error("Domain %u was already present", dom_id); + bpf_cpumask_release(cpumask); + return 1; + } + + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(atropos_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + + bpf_loop(nr_doms, create_dom_dsq, NULL, 0); + + for (u32 i = 0; i < nr_cpus; i++) + pcpu_ctx[i].dom_rr_cur = i; + + return 0; +} + +void BPF_STRUCT_OPS(atropos_exit, struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(exit_msg, sizeof(exit_msg), ei->msg); + exit_type = ei->type; +} + +SEC(".struct_ops") +struct sched_ext_ops atropos = { + .select_cpu = (void *)atropos_select_cpu, + .enqueue = (void *)atropos_enqueue, + .dispatch = (void *)atropos_dispatch, + .runnable = (void *)atropos_runnable, + .running = (void *)atropos_running, + .stopping = (void *)atropos_stopping, + .quiescent = (void *)atropos_quiescent, + .set_weight = (void *)atropos_set_weight, + .set_cpumask = (void *)atropos_set_cpumask, + .prep_enable = (void *)atropos_prep_enable, + .disable = (void *)atropos_disable, + .init = (void *)atropos_init, + .exit = (void *)atropos_exit, + .flags = 0, + .name = "atropos", +}; diff --git a/tools/sched_ext/atropos/src/bpf/atropos.h b/tools/sched_ext/atropos/src/bpf/atropos.h new file mode 100644 index 0000000000000..addf29ca104a5 --- /dev/null +++ b/tools/sched_ext/atropos/src/bpf/atropos.h @@ -0,0 +1,44 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
+#ifndef __ATROPOS_H +#define __ATROPOS_H + +#include +#ifndef __kptr +#ifdef __KERNEL__ +#error "__kptr_ref not defined in the kernel" +#endif +#define __kptr +#endif + +#define MAX_CPUS 512 +#define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ +#define CACHELINE_SIZE 64 + +/* Statistics */ +enum stat_idx { + ATROPOS_STAT_TASK_GET_ERR, + ATROPOS_STAT_WAKE_SYNC, + ATROPOS_STAT_PREV_IDLE, + ATROPOS_STAT_PINNED, + ATROPOS_STAT_DIRECT_DISPATCH, + ATROPOS_STAT_DSQ_DISPATCH, + ATROPOS_STAT_GREEDY, + ATROPOS_STAT_LOAD_BALANCE, + ATROPOS_STAT_LAST_TASK, + ATROPOS_NR_STATS, +}; + +struct task_ctx { + unsigned long long dom_mask; /* the domains this task can run on */ + struct bpf_cpumask __kptr *cpumask; + unsigned int dom_id; + unsigned int weight; + unsigned long long runnable_at; + unsigned long long runnable_for; + bool dispatch_local; +}; + +#endif /* __ATROPOS_H */ diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs new file mode 100644 index 0000000000000..0d313662f7136 --- /dev/null +++ b/tools/sched_ext/atropos/src/main.rs @@ -0,0 +1,942 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#[path = "bpf/.output/atropos.skel.rs"] +mod atropos; +pub use atropos::*; +pub mod atropos_sys; + +use std::cell::Cell; +use std::collections::{BTreeMap, BTreeSet}; +use std::ffi::CStr; +use std::ops::Bound::{Included, Unbounded}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; + +use ::fb_procfs as procfs; +use anyhow::{anyhow, bail, Context, Result}; +use bitvec::prelude::*; +use clap::Parser; +use log::{info, trace, warn}; +use ordered_float::OrderedFloat; + +/// Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF +/// part does simple round robin in each domain and the userspace part +/// calculates the load factor of each domain and tells the BPF part how to load +/// balance the domains. + +/// This scheduler demonstrates dividing scheduling logic between BPF and +/// userspace and using rust to build the userspace part. An earlier variant of +/// this scheduler was used to balance across six domains, each representing a +/// chiplet in a six-chiplet AMD processor, and could match the performance of +/// production setup using CFS. +#[derive(Debug, Parser)] +struct Opts { + /// Scheduling slice duration in microseconds. + #[clap(short, long, default_value = "20000")] + slice_us: u64, + + /// Monitoring and load balance interval in seconds. + #[clap(short, long, default_value = "2.0")] + interval: f64, + + /// Build domains according to how CPUs are grouped at this cache level + /// as determined by /sys/devices/system/cpu/cpuX/cache/indexI/id. + #[clap(short = 'c', long, default_value = "3")] + cache_level: u32, + + /// Instead of using cache locality, set the cpumask for each domain + /// manually, provide multiple --cpumasks, one for each domain. E.g. + /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains with + /// the corresponding CPUs belonging to each domain. Each CPU must + /// belong to precisely one domain. + #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")] + cpumasks: Vec, + + /// When non-zero, enable greedy task stealing. When a domain is idle, a + /// cpu will attempt to steal tasks from a domain with at least + /// greedy_threshold tasks enqueued. 
These tasks aren't permanently + /// stolen from the domain. + #[clap(short, long, default_value = "4")] + greedy_threshold: u32, + + /// The load decay factor. Every interval, the existing load is decayed + /// by this factor and new load is added. Must be in the range [0.0, + /// 0.99]. The smaller the value, the more sensitive load calculation + /// is to recent changes. When 0.0, history is ignored and the load + /// value from the latest period is used directly. + #[clap(short, long, default_value = "0.5")] + load_decay_factor: f64, + + /// Disable load balancing. Unless disabled, periodically userspace will + /// calculate the load factor of each domain and instruct BPF which + /// processes to move. + #[clap(short, long, action = clap::ArgAction::SetTrue)] + no_load_balance: bool, + + /// Put per-cpu kthreads directly into local dsq's. + #[clap(short, long, action = clap::ArgAction::SetTrue)] + kthreads_local: bool, + + /// Use FIFO scheduling instead of weighted vtime scheduling. + #[clap(short, long, action = clap::ArgAction::SetTrue)] + fifo_sched: bool, + + /// If specified, only tasks which have their scheduling policy set to + /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all + /// tasks are switched. + #[clap(short, long, action = clap::ArgAction::SetTrue)] + partial: bool, + + /// Enable verbose output including libbpf details. Specify multiple + /// times to increase verbosity. + #[clap(short, long, action = clap::ArgAction::Count)] + verbose: u8, +} + +fn read_total_cpu(reader: &mut procfs::ProcReader) -> Result { + Ok(reader + .read_stat() + .context("Failed to read procfs")? + .total_cpu + .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))?) +} + +fn now_monotonic() -> u64 { + let mut time = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; + assert!(ret == 0); + time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 +} + +fn clear_map(map: &mut libbpf_rs::Map) { + // XXX: libbpf_rs has some design flaw that make it impossible to + // delete while iterating despite it being safe so we alias it here + let deleter: &mut libbpf_rs::Map = unsafe { &mut *(map as *mut _) }; + for key in map.keys() { + let _ = deleter.delete(&key); + } +} + +#[derive(Debug)] +struct TaskLoad { + runnable_for: u64, + load: f64, +} + +#[derive(Debug)] +struct TaskInfo { + pid: i32, + dom_mask: u64, + migrated: Cell, +} + +struct LoadBalancer<'a, 'b, 'c> { + maps: AtroposMapsMut<'a>, + task_loads: &'b mut BTreeMap, + nr_doms: usize, + load_decay_factor: f64, + + tasks_by_load: Vec, TaskInfo>>, + load_avg: f64, + dom_loads: Vec, + + imbal: Vec, + doms_to_push: BTreeMap, u32>, + doms_to_pull: BTreeMap, u32>, + + nr_lb_data_errors: &'c mut u64, +} + +impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { + const LOAD_IMBAL_HIGH_RATIO: f64 = 0.10; + const LOAD_IMBAL_REDUCTION_MIN_RATIO: f64 = 0.1; + const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; + + fn new( + maps: AtroposMapsMut<'a>, + task_loads: &'b mut BTreeMap, + nr_doms: usize, + load_decay_factor: f64, + nr_lb_data_errors: &'c mut u64, + ) -> Self { + Self { + maps, + task_loads, + nr_doms, + load_decay_factor, + + tasks_by_load: (0..nr_doms).map(|_| BTreeMap::<_, _>::new()).collect(), + load_avg: 0f64, + dom_loads: vec![0.0; nr_doms], + + imbal: vec![0.0; nr_doms], + doms_to_pull: BTreeMap::new(), + doms_to_push: BTreeMap::new(), + + nr_lb_data_errors, + } + } + + fn read_task_loads(&mut self, period: Duration) -> Result<()> { + let now_mono = 
now_monotonic(); + let task_data = self.maps.task_data(); + let mut this_task_loads = BTreeMap::::new(); + let mut load_sum = 0.0f64; + self.dom_loads = vec![0f64; self.nr_doms]; + + for key in task_data.keys() { + if let Some(task_ctx_vec) = task_data + .lookup(&key, libbpf_rs::MapFlags::ANY) + .context("Failed to lookup task_data")? + { + let task_ctx = + unsafe { &*(task_ctx_vec.as_slice().as_ptr() as *const atropos_sys::task_ctx) }; + let pid = i32::from_ne_bytes( + key.as_slice() + .try_into() + .context("Invalid key length in task_data map")?, + ); + + let (this_at, this_for, weight) = unsafe { + ( + std::ptr::read_volatile(&task_ctx.runnable_at as *const u64), + std::ptr::read_volatile(&task_ctx.runnable_for as *const u64), + std::ptr::read_volatile(&task_ctx.weight as *const u32), + ) + }; + + let (mut delta, prev_load) = match self.task_loads.get(&pid) { + Some(prev) => (this_for - prev.runnable_for, Some(prev.load)), + None => (this_for, None), + }; + + // Non-zero this_at indicates that the task is currently + // runnable. Note that we read runnable_at and runnable_for + // without any synchronization and there is a small window + // where we end up misaccounting. While this can cause + // temporary error, it's unlikely to cause any noticeable + // misbehavior especially given the load value clamping. + if this_at > 0 && this_at < now_mono { + delta += now_mono - this_at; + } + + delta = delta.min(period.as_nanos() as u64); + let this_load = (weight as f64 * delta as f64 / period.as_nanos() as f64) + .clamp(0.0, weight as f64); + + let this_load = match prev_load { + Some(prev_load) => { + prev_load * self.load_decay_factor + + this_load * (1.0 - self.load_decay_factor) + } + None => this_load, + }; + + this_task_loads.insert( + pid, + TaskLoad { + runnable_for: this_for, + load: this_load, + }, + ); + + load_sum += this_load; + self.dom_loads[task_ctx.dom_id as usize] += this_load; + // Only record pids that are eligible for load balancing + if task_ctx.dom_mask == (1u64 << task_ctx.dom_id) { + continue; + } + self.tasks_by_load[task_ctx.dom_id as usize].insert( + OrderedFloat(this_load), + TaskInfo { + pid, + dom_mask: task_ctx.dom_mask, + migrated: Cell::new(false), + }, + ); + } + } + + self.load_avg = load_sum / self.nr_doms as f64; + *self.task_loads = this_task_loads; + Ok(()) + } + + // To balance dom loads we identify doms with lower and higher load than average + fn calculate_dom_load_balance(&mut self) -> Result<()> { + for (dom, dom_load) in self.dom_loads.iter().enumerate() { + let imbal = dom_load - self.load_avg; + if imbal.abs() >= self.load_avg * Self::LOAD_IMBAL_HIGH_RATIO { + if imbal > 0f64 { + self.doms_to_push.insert(OrderedFloat(imbal), dom as u32); + } else { + self.doms_to_pull.insert(OrderedFloat(-imbal), dom as u32); + } + self.imbal[dom] = imbal; + } + } + Ok(()) + } + + // Find the first candidate pid which hasn't already been migrated and + // can run in @pull_dom. 
+ fn find_first_candidate<'d, I>(tasks_by_load: I, pull_dom: u32) -> Option<(f64, &'d TaskInfo)> + where + I: IntoIterator, &'d TaskInfo)>, + { + match tasks_by_load + .into_iter() + .skip_while(|(_, task)| task.migrated.get() || task.dom_mask & (1 << pull_dom) == 0) + .next() + { + Some((OrderedFloat(load), task)) => Some((*load, task)), + None => None, + } + } + + fn pick_victim( + &self, + (push_dom, to_push): (u32, f64), + (pull_dom, to_pull): (u32, f64), + ) -> Option<(&TaskInfo, f64)> { + let to_xfer = to_pull.min(to_push); + + trace!( + "considering dom {}@{:.2} -> {}@{:.2}", + push_dom, + to_push, + pull_dom, + to_pull + ); + + let calc_new_imbal = |xfer: f64| (to_push - xfer).abs() + (to_pull - xfer).abs(); + + trace!( + "to_xfer={:.2} tasks_by_load={:?}", + to_xfer, + &self.tasks_by_load[push_dom as usize] + ); + + // We want to pick a task to transfer from push_dom to pull_dom to + // maximize the reduction of load imbalance between the two. IOW, + // pick a task which has the closest load value to $to_xfer that can + // be migrated. Find such task by locating the first migratable task + // while scanning left from $to_xfer and the counterpart while + // scanning right and picking the better of the two. + let (load, task, new_imbal) = match ( + Self::find_first_candidate( + self.tasks_by_load[push_dom as usize] + .range((Unbounded, Included(&OrderedFloat(to_xfer)))) + .rev(), + pull_dom, + ), + Self::find_first_candidate( + self.tasks_by_load[push_dom as usize] + .range((Included(&OrderedFloat(to_xfer)), Unbounded)), + pull_dom, + ), + ) { + (None, None) => return None, + (Some((load, task)), None) | (None, Some((load, task))) => { + (load, task, calc_new_imbal(load)) + } + (Some((load0, task0)), Some((load1, task1))) => { + let (new_imbal0, new_imbal1) = (calc_new_imbal(load0), calc_new_imbal(load1)); + if new_imbal0 <= new_imbal1 { + (load0, task0, new_imbal0) + } else { + (load1, task1, new_imbal1) + } + } + }; + + // If the best candidate can't reduce the imbalance, there's nothing + // to do for this pair. + let old_imbal = to_push + to_pull; + if old_imbal * (1.0 - Self::LOAD_IMBAL_REDUCTION_MIN_RATIO) < new_imbal { + trace!( + "skipping pid {}, dom {} -> {} won't improve imbal {:.2} -> {:.2}", + task.pid, + push_dom, + pull_dom, + old_imbal, + new_imbal + ); + return None; + } + + trace!( + "migrating pid {}, dom {} -> {}, imbal={:.2} -> {:.2}", + task.pid, + push_dom, + pull_dom, + old_imbal, + new_imbal, + ); + + Some((task, load)) + } + + // Actually execute the load balancing. Concretely this writes pid -> dom + // entries into the lb_data map for bpf side to consume. + fn load_balance(&mut self) -> Result<()> { + clear_map(self.maps.lb_data()); + + trace!("imbal={:?}", &self.imbal); + trace!("doms_to_push={:?}", &self.doms_to_push); + trace!("doms_to_pull={:?}", &self.doms_to_pull); + + // Push from the most imbalanced to least. + while let Some((OrderedFloat(mut to_push), push_dom)) = self.doms_to_push.pop_last() { + let push_max = self.dom_loads[push_dom as usize] * Self::LOAD_IMBAL_PUSH_MAX_RATIO; + let mut pushed = 0f64; + + // Transfer tasks from push_dom to reduce imbalance. + loop { + let last_pushed = pushed; + + // Pull from the most imbalaned to least. 
+ let mut doms_to_pull = BTreeMap::<_, _>::new(); + std::mem::swap(&mut self.doms_to_pull, &mut doms_to_pull); + let mut pull_doms = doms_to_pull.into_iter().rev().collect::>(); + + for (to_pull, pull_dom) in pull_doms.iter_mut() { + if let Some((task, load)) = + self.pick_victim((push_dom, to_push), (*pull_dom, f64::from(*to_pull))) + { + // Execute migration. + task.migrated.set(true); + to_push -= load; + *to_pull -= load; + pushed += load; + + // Ask BPF code to execute the migration. + let pid = task.pid; + let cpid = (pid as libc::pid_t).to_ne_bytes(); + if let Err(e) = self.maps.lb_data().update( + &cpid, + &pull_dom.to_ne_bytes(), + libbpf_rs::MapFlags::NO_EXIST, + ) { + warn!( + "Failed to update lb_data map for pid={} error={:?}", + pid, &e + ); + *self.nr_lb_data_errors += 1; + } + + // Always break after a successful migration so that + // the pulling domains are always considered in the + // descending imbalance order. + break; + } + } + + pull_doms + .into_iter() + .map(|(k, v)| self.doms_to_pull.insert(k, v)) + .count(); + + // Stop repeating if nothing got transferred or pushed enough. + if pushed == last_pushed || pushed >= push_max { + break; + } + } + } + Ok(()) + } +} + +struct Scheduler<'a> { + skel: AtroposSkel<'a>, + struct_ops: Option, + + nr_cpus: usize, + nr_doms: usize, + load_decay_factor: f64, + balance_load: bool, + + proc_reader: procfs::ProcReader, + + prev_at: SystemTime, + prev_total_cpu: procfs::CpuStat, + task_loads: BTreeMap, + + nr_lb_data_errors: u64, +} + +impl<'a> Scheduler<'a> { + // Returns Vec of cpuset for each dq and a vec of dq for each cpu + fn parse_cpusets( + cpumasks: &[String], + nr_cpus: usize, + ) -> Result<(Vec>, Vec)> { + if cpumasks.len() > atropos_sys::MAX_DOMS as usize { + bail!( + "Number of requested DSQs ({}) is greater than MAX_DOMS ({})", + cpumasks.len(), + atropos_sys::MAX_DOMS + ); + } + let mut cpus = vec![-1i32; nr_cpus]; + let mut cpusets = + vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; cpumasks.len()]; + for (dq, cpumask) in cpumasks.iter().enumerate() { + let hex_str = { + let mut tmp_str = cpumask + .strip_prefix("0x") + .unwrap_or(cpumask) + .replace('_', ""); + if tmp_str.len() % 2 != 0 { + tmp_str = "0".to_string() + &tmp_str; + } + tmp_str + }; + let byte_vec = hex::decode(&hex_str) + .with_context(|| format!("Failed to parse cpumask: {}", cpumask))?; + + for (index, &val) in byte_vec.iter().rev().enumerate() { + let mut v = val; + while v != 0 { + let lsb = v.trailing_zeros() as usize; + v &= !(1 << lsb); + let cpu = index * 8 + lsb; + if cpu > nr_cpus { + bail!( + concat!( + "Found cpu ({}) in cpumask ({}) which is larger", + " than the number of cpus on the machine ({})" + ), + cpu, + cpumask, + nr_cpus + ); + } + if cpus[cpu] != -1 { + bail!( + "Found cpu ({}) with dq ({}) but also in cpumask ({})", + cpu, + cpus[cpu], + cpumask + ); + } + cpus[cpu] = dq as i32; + cpusets[dq].set(cpu, true); + } + } + cpusets[dq].set_uninitialized(false); + } + + for (cpu, &dq) in cpus.iter().enumerate() { + if dq < 0 { + bail!( + "Cpu {} not assigned to any dq. Make sure it is covered by some --cpumasks argument.", + cpu + ); + } + } + + Ok((cpusets, cpus)) + } + + // Returns Vec of cpuset for each dq and a vec of dq for each cpu + fn cpusets_from_cache( + level: u32, + nr_cpus: usize, + ) -> Result<(Vec>, Vec)> { + let mut cpu_to_cache = vec![]; // (cpu_id, cache_id) + let mut cache_ids = BTreeSet::::new(); + let mut nr_not_found = 0; + + // Build cpu -> cache ID mapping. 
+ for cpu in 0..nr_cpus { + let path = format!("/sys/devices/system/cpu/cpu{}/cache/index{}/id", cpu, level); + let id = match std::fs::read_to_string(&path) { + Ok(val) => val + .trim() + .parse::() + .with_context(|| format!("Failed to parse {:?}'s content {:?}", &path, &val))?, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + nr_not_found += 1; + 0 + } + Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), + }; + + cpu_to_cache.push(id); + cache_ids.insert(id); + } + + if nr_not_found > 1 { + warn!( + "Couldn't determine level {} cache IDs for {} CPUs out of {}, assigned to cache ID 0", + level, nr_not_found, nr_cpus + ); + } + + // Cache IDs may have holes. Assign consecutive domain IDs to + // existing cache IDs. + let mut cache_to_dom = BTreeMap::::new(); + let mut nr_doms = 0; + for cache_id in cache_ids.iter() { + cache_to_dom.insert(*cache_id, nr_doms); + nr_doms += 1; + } + + if nr_doms > atropos_sys::MAX_DOMS { + bail!( + "Total number of doms {} is greater than MAX_DOMS ({})", + nr_doms, + atropos_sys::MAX_DOMS + ); + } + + // Build and return dom -> cpumask and cpu -> dom mappings. + let mut cpusets = + vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; nr_doms as usize]; + let mut cpu_to_dom = vec![]; + + for cpu in 0..nr_cpus { + let dom_id = cache_to_dom[&cpu_to_cache[cpu]]; + cpusets[dom_id as usize].set(cpu, true); + cpu_to_dom.push(dom_id as i32); + } + + Ok((cpusets, cpu_to_dom)) + } + + fn init(opts: &Opts) -> Result { + // Open the BPF prog first for verification. + let mut skel_builder = AtroposSkelBuilder::default(); + skel_builder.obj_builder.debug(opts.verbose > 0); + let mut skel = skel_builder.open().context("Failed to open BPF program")?; + + let nr_cpus = libbpf_rs::num_possible_cpus().unwrap(); + if nr_cpus > atropos_sys::MAX_CPUS as usize { + bail!( + "nr_cpus ({}) is greater than MAX_CPUS ({})", + nr_cpus, + atropos_sys::MAX_CPUS + ); + } + + // Initialize skel according to @opts. + let (cpusets, cpus) = if opts.cpumasks.len() > 0 { + Self::parse_cpusets(&opts.cpumasks, nr_cpus)? + } else { + Self::cpusets_from_cache(opts.cache_level, nr_cpus)? + }; + let nr_doms = cpusets.len(); + skel.rodata().nr_doms = nr_doms as u32; + skel.rodata().nr_cpus = nr_cpus as u32; + + for (cpu, dom) in cpus.iter().enumerate() { + skel.rodata().cpu_dom_id_map[cpu] = *dom as u32; + } + + for (dom, cpuset) in cpusets.iter().enumerate() { + let raw_cpuset_slice = cpuset.as_raw_slice(); + let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; + let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpuset_slice.len()); + left.clone_from_slice(cpuset.as_raw_slice()); + let cpumask_str = dom_cpumask_slice + .iter() + .take((nr_cpus + 63) / 64) + .rev() + .fold(String::new(), |acc, x| format!("{} {:016X}", acc, x)); + info!( + "DOM[{:02}] cpumask{} ({} cpus)", + dom, + &cpumask_str, + cpuset.count_ones() + ); + } + + skel.rodata().slice_us = opts.slice_us; + skel.rodata().kthreads_local = opts.kthreads_local; + skel.rodata().fifo_sched = opts.fifo_sched; + skel.rodata().switch_partial = opts.partial; + skel.rodata().greedy_threshold = opts.greedy_threshold; + + // Attach. + let mut skel = skel.load().context("Failed to load BPF program")?; + skel.attach().context("Failed to attach BPF program")?; + let struct_ops = Some( + skel.maps_mut() + .atropos() + .attach_struct_ops() + .context("Failed to attach atropos struct ops")?, + ); + info!("Atropos Scheduler Attached"); + + // Other stuff. 
+ let mut proc_reader = procfs::ProcReader::new(); + let prev_total_cpu = read_total_cpu(&mut proc_reader)?; + + Ok(Self { + skel, + struct_ops, // should be held to keep it attached + + nr_cpus, + nr_doms, + load_decay_factor: opts.load_decay_factor.clamp(0.0, 0.99), + balance_load: !opts.no_load_balance, + + proc_reader, + + prev_at: SystemTime::now(), + prev_total_cpu, + task_loads: BTreeMap::new(), + + nr_lb_data_errors: 0, + }) + } + + fn get_cpu_busy(&mut self) -> Result { + let total_cpu = read_total_cpu(&mut self.proc_reader)?; + let busy = match (&self.prev_total_cpu, &total_cpu) { + ( + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + guest_usec: _, + guest_nice_usec: _, + }, + procfs::CpuStat { + user_usec: Some(curr_user), + nice_usec: Some(curr_nice), + system_usec: Some(curr_system), + idle_usec: Some(curr_idle), + iowait_usec: Some(curr_iowait), + irq_usec: Some(curr_irq), + softirq_usec: Some(curr_softirq), + stolen_usec: Some(curr_stolen), + guest_usec: _, + guest_nice_usec: _, + }, + ) => { + let idle_usec = curr_idle - prev_idle; + let iowait_usec = curr_iowait - prev_iowait; + let user_usec = curr_user - prev_user; + let system_usec = curr_system - prev_system; + let nice_usec = curr_nice - prev_nice; + let irq_usec = curr_irq - prev_irq; + let softirq_usec = curr_softirq - prev_softirq; + let stolen_usec = curr_stolen - prev_stolen; + + let busy_usec = + user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; + let total_usec = idle_usec + busy_usec + iowait_usec; + busy_usec as f64 / total_usec as f64 + } + _ => { + bail!("Some procfs stats are not populated!"); + } + }; + + self.prev_total_cpu = total_cpu; + Ok(busy) + } + + fn read_bpf_stats(&mut self) -> Result> { + let mut maps = self.skel.maps_mut(); + let stats_map = maps.stats(); + let mut stats: Vec = Vec::new(); + let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.nr_cpus]; + + for stat in 0..atropos_sys::stat_idx_ATROPOS_NR_STATS { + let cpu_stat_vec = stats_map + .lookup_percpu(&(stat as u32).to_ne_bytes(), libbpf_rs::MapFlags::ANY) + .with_context(|| format!("Failed to lookup stat {}", stat))? 
+ .expect("per-cpu stat should exist"); + let sum = cpu_stat_vec + .iter() + .map(|val| { + u64::from_ne_bytes( + val.as_slice() + .try_into() + .expect("Invalid value length in stat map"), + ) + }) + .sum(); + stats_map + .update_percpu( + &(stat as u32).to_ne_bytes(), + &zero_vec, + libbpf_rs::MapFlags::ANY, + ) + .context("Failed to zero stat")?; + stats.push(sum); + } + Ok(stats) + } + + fn report( + &self, + stats: &Vec, + cpu_busy: f64, + processing_dur: Duration, + load_avg: f64, + dom_loads: &Vec, + imbal: &Vec, + ) { + let stat = |idx| stats[idx as usize]; + let total = stat(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_PINNED) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_LAST_TASK); + + info!( + "cpu={:6.1} load_avg={:7.1} bal={} task_err={} lb_data_err={} proc={:?}ms", + cpu_busy * 100.0, + load_avg, + stats[atropos_sys::stat_idx_ATROPOS_STAT_LOAD_BALANCE as usize], + stats[atropos_sys::stat_idx_ATROPOS_STAT_TASK_GET_ERR as usize], + self.nr_lb_data_errors, + processing_dur.as_millis(), + ); + + let stat_pct = |idx| stat(idx) as f64 / total as f64 * 100.0; + + info!( + "tot={:6} wsync={:4.1} prev_idle={:4.1} pin={:4.1} dir={:4.1} dq={:4.1} greedy={:4.1}", + total, + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PINNED), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY), + ); + + for i in 0..self.nr_doms { + info!( + "DOM[{:02}] load={:7.1} to_pull={:7.1} to_push={:7.1}", + i, + dom_loads[i], + if imbal[i] < 0.0 { -imbal[i] } else { 0.0 }, + if imbal[i] > 0.0 { imbal[i] } else { 0.0 }, + ); + } + } + + fn step(&mut self) -> Result<()> { + let started_at = std::time::SystemTime::now(); + let bpf_stats = self.read_bpf_stats()?; + let cpu_busy = self.get_cpu_busy()?; + + let mut lb = LoadBalancer::new( + self.skel.maps_mut(), + &mut self.task_loads, + self.nr_doms, + self.load_decay_factor, + &mut self.nr_lb_data_errors, + ); + + lb.read_task_loads(started_at.duration_since(self.prev_at)?)?; + lb.calculate_dom_load_balance()?; + + if self.balance_load { + lb.load_balance()?; + } + + // Extract fields needed for reporting and drop lb to release + // mutable borrows. + let (load_avg, dom_loads, imbal) = (lb.load_avg, lb.dom_loads, lb.imbal); + + self.report( + &bpf_stats, + cpu_busy, + std::time::SystemTime::now().duration_since(started_at)?, + load_avg, + &dom_loads, + &imbal, + ); + + self.prev_at = started_at; + Ok(()) + } + + fn read_bpf_exit_type(&mut self) -> i32 { + unsafe { std::ptr::read_volatile(&self.skel.bss().exit_type as *const _) } + } + + fn report_bpf_exit_type(&mut self) -> Result<()> { + // Report msg if EXT_OPS_EXIT_ERROR. 
+ match self.read_bpf_exit_type() { + 0 => Ok(()), + etype if etype == 2 => { + let cstr = unsafe { CStr::from_ptr(self.skel.bss().exit_msg.as_ptr() as *const _) }; + let msg = cstr + .to_str() + .context("Failed to convert exit msg to string") + .unwrap(); + bail!("BPF exit_type={} msg={}", etype, msg); + } + etype => { + info!("BPF exit_type={}", etype); + Ok(()) + } + } + } +} + +impl<'a> Drop for Scheduler<'a> { + fn drop(&mut self) { + if let Some(struct_ops) = self.struct_ops.take() { + drop(struct_ops); + } + } +} + +fn main() -> Result<()> { + let opts = Opts::parse(); + + let llv = match opts.verbose { + 0 => simplelog::LevelFilter::Info, + 1 => simplelog::LevelFilter::Debug, + _ => simplelog::LevelFilter::Trace, + }; + let mut lcfg = simplelog::ConfigBuilder::new(); + lcfg.set_time_level(simplelog::LevelFilter::Error) + .set_location_level(simplelog::LevelFilter::Off) + .set_target_level(simplelog::LevelFilter::Off) + .set_thread_level(simplelog::LevelFilter::Off); + simplelog::TermLogger::init( + llv, + lcfg.build(), + simplelog::TerminalMode::Stderr, + simplelog::ColorChoice::Auto, + )?; + + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_clone = shutdown.clone(); + ctrlc::set_handler(move || { + shutdown_clone.store(true, Ordering::Relaxed); + }) + .context("Error setting Ctrl-C handler")?; + + let mut sched = Scheduler::init(&opts)?; + + while !shutdown.load(Ordering::Relaxed) && sched.read_bpf_exit_type() == 0 { + std::thread::sleep(Duration::from_secs_f64(opts.interval)); + sched.step()?; + } + + sched.report_bpf_exit_type() +} From 41e0d706939b7246b5a261db115824eaeb92d611 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 13 Apr 2023 18:10:39 -0500 Subject: [PATCH 033/304] Atropos: Use slice_ns in BPF instead of slice_us Atropos takes a slice_us command line parameter to tune the scheduler's slice length. This value is passed directly to BPF, where we always multiply the value by 1000. To simplify the BPF code, we can just make the variable slice_ns in BPF, and multiply it by 1000 in rust when the program is first opened. Signed-off-by: David Vernet --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 10 +++++----- tools/sched_ext/atropos/src/main.rs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 3905a403e9406..18df15ab192aa 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -66,7 +66,7 @@ const volatile bool switch_partial; const volatile __u32 greedy_threshold; /* base slice duration */ -const volatile __u64 slice_us = 20000; +const volatile __u64 slice_ns = SCX_SLICE_DFL; /* * Exit info @@ -340,12 +340,12 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) if (task_ctx->dispatch_local) { task_ctx->dispatch_local = false; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_us * 1000, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; } if (fifo_sched) { - scx_bpf_dispatch(p, task_ctx->dom_id, slice_us * 1000, + scx_bpf_dispatch(p, task_ctx->dom_id, slice_ns, enq_flags); } else { u64 vtime = p->scx.dsq_vtime; @@ -362,8 +362,8 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) * Limit the amount of budget that an idling task can accumulate * to one slice. 
*/ - if (vtime_before(vtime, domc->vtime_now - slice_us * 1000)) - vtime = domc->vtime_now - slice_us * 1000; + if (vtime_before(vtime, domc->vtime_now - slice_ns)) + vtime = domc->vtime_now - slice_ns; scx_bpf_dispatch_vtime(p, task_ctx->dom_id, SCX_SLICE_DFL, vtime, enq_flags); diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 0d313662f7136..ebf8ba35c4b28 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -658,7 +658,7 @@ impl<'a> Scheduler<'a> { ); } - skel.rodata().slice_us = opts.slice_us; + skel.rodata().slice_ns = opts.slice_us * 1000; skel.rodata().kthreads_local = opts.kthreads_local; skel.rodata().fifo_sched = opts.fifo_sched; skel.rodata().switch_partial = opts.partial; From 17c9f7e75c5591c6c320f544f576345d3bb149b3 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 13 Apr 2023 18:12:33 -0500 Subject: [PATCH 034/304] Atropos: Use slice_ns instead of SCX_SLICE_DFL There are a few places in Atropos where we ignore the slice_ns variable when calculating vtime, and dispatching tasks. This patch fixes that by updating those callsites t use the slice_ns variable that's set by user space (or set by default) rather than using SCX_SLICE_DFL. Signed-off-by: David Vernet --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 18df15ab192aa..0f9a3fcfcd9ee 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -365,7 +365,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) if (vtime_before(vtime, domc->vtime_now - slice_ns)) vtime = domc->vtime_now - slice_ns; - scx_bpf_dispatch_vtime(p, task_ctx->dom_id, SCX_SLICE_DFL, vtime, + scx_bpf_dispatch_vtime(p, task_ctx->dom_id, slice_ns, vtime, enq_flags); } } @@ -492,7 +492,7 @@ void BPF_STRUCT_OPS(atropos_stopping, struct task_struct *p, bool runnable) return; /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; + p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; } void BPF_STRUCT_OPS(atropos_quiescent, struct task_struct *p, u64 deq_flags) From b9417e6e482d460da413114db97d6c11def27a12 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 14 Apr 2023 10:58:09 -0500 Subject: [PATCH 035/304] scx: Call ops.set_weight() with rq lock on scx_post_fork() path There's currently a race when invoking ops.set_weight() that could cause a warning to be emitted due to unexpected nesting. Most scx ops are always called either with or without interrupts being enabled. ops.set_weight() is a bit special in that it can be called both on the post-fork path from scx_post_fork(), or from switching_to_scx() / reweight_task_scx(), which are both called with the rq lock held and interrupts disabled. This can cause the following warning to be emitted if a timer interrupt is fired while executing the ops.set_weight() callback on the post-fork path: [ 4975.435822] invalid nesting current->scx.kf_mask=0x20 mask=0x20 [ 4975.447670] WARNING: CPU: 33 PID: 431940 at kernel/sched/ext.c:213 select_task_rq_scx+0x173/0x190 ... 
[ 4975.615777] RIP: 0010:select_task_rq_scx+0x173/0x190 [ 4975.625721] Code: 3d 8e ef 57 02 00 0f 85 f6 fe ff ff 8b b2 e0 02 00 00 48 c7 c7 20 91 70 82 ba 20 00 00 00 c6 05 6f ef 57 02 01 e8 64 22 b9 00 <0f> 0b e9 d1 fe ff ff 41 83 8d d0 02 00 00 04 41 89 dc e9 ff fe ff [ 4975.663290] RSP: 0018:ffffc90000f8ce98 EFLAGS: 00010082 [ 4975.673746] RAX: 0000000000000000 RBX: 0000000000000021 RCX: 0000000000000027 [ 4975.688026] RDX: ffff88903fc60a08 RSI: 0000000000000001 RDI: ffff88903fc60a00 [ 4975.702305] RBP: 0000000000000003 R08: ffffffff83564c08 R09: 0000000000000003 [ 4975.716581] R10: ffffffff82e64c20 R11: ffffffff833e4c20 R12: 0000000000000008 [ 4975.730858] R13: ffff8884ef9f6500 R14: ffff8884ef9f6e80 R15: ffffc9000183be18 [ 4975.745134] FS: 00007fc5ad8cd740(0000) GS:ffff88903fc40000(0000) knlGS:0000000000000000 [ 4975.761325] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4975.772822] CR2: 000056284b9183b8 CR3: 00000004fe043001 CR4: 00000000007706e0 [ 4975.787100] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4975.801376] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4975.815654] PKRU: 55555554 [ 4975.821065] Call Trace: [ 4975.825955] [ 4975.829978] try_to_wake_up+0xe3/0x500 [ 4975.837484] ? __hrtimer_init+0xa0/0xa0 [ 4975.845160] hrtimer_wakeup+0x1e/0x30 [ 4975.852485] __hrtimer_run_queues+0x137/0x270 [ 4975.861206] hrtimer_interrupt+0x10e/0x230 [ 4975.869404] __sysvec_apic_timer_interrupt+0x4e/0xc0 [ 4975.879339] sysvec_apic_timer_interrupt+0x6d/0x90 [ 4975.888931] [ 4975.893130] ... [ 4975.897327] asm_sysvec_apic_timer_interrupt+0x16/0x20 [ 4975.907610] RIP: 0010:__htab_map_lookup_elem+0x65/0xa0 [ 4975.917904] Code: 89 ee 49 c1 e6 05 4d 03 b4 24 a0 01 00 00 4d 8b 26 41 f6 c4 01 74 0c eb 33 4d 8b 24 24 41 f6 c4 01 75 29 45 3b 7c 24 28 75 ef <49> 8d 7c 24 30 48 89 da 48 89 ee e8 eb 30 55 00 85 c0 75 db 5b 4c [ 4975.955470] RSP: 0018:ffffc90025437ca0 EFLAGS: 00000246 [ 4975.965923] RAX: 0000000000100000 RBX: 0000000000000004 RCX: 00000000a8623740 [ 4975.980198] RDX: 00000000c9edf4bd RSI: 00000000af4c9372 RDI: ffffc90025437cec [ 4975.994476] RBP: ffffc90025437cec R08: ffffffff82e12f38 R09: ffffffff82e12f38 [ 4976.008752] R10: 0000000000000008 R11: 0000000000000008 R12: ffffc900347349a8 [ 4976.023027] R13: 000000000001525f R14: ffffc90031b05be0 R15: 00000000a061525f [ 4976.037309] bpf_prog_4d41ada04df44ed7_atropos_set_weight+0x3a/0x7a [ 4976.049861] ? refresh_scx_weight+0x9b/0xe0 [ 4976.058239] ? scx_post_fork+0x25/0x120 [ 4976.065925] ? ktime_get_with_offset+0x4c/0xb0 [ 4976.074821] ? copy_process+0x15fb/0x1af0 [ 4976.082853] ? kernel_clone+0x9b/0x3b0 [ 4976.090355] ? __do_sys_clone+0x66/0x90 [ 4976.098042] ? do_syscall_64+0x35/0x80 [ 4976.105547] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 4976.116005] Let's fix this by grabbing the rq lock on the scx_post_fork() path before calling refresh_scx_weight(). We also update the logic to set the struct task_struct * weight before calling ops.enable(), and then to invoke the ops.set_weight() callback only after calling ops.enable(). This is done to avoid invoking a callback on the task before the scheduler's been told that it's fully enabled. 
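In outline, the post-fork enable path after this change looks like the following
condensed sketch of the hunk below (unrelated parts of the function are omitted):

  void scx_post_fork(struct task_struct *p)
  {
          if (scx_enabled()) {
                  struct rq_flags rf;
                  struct rq *rq;

                  /*
                   * rq lock held and IRQs disabled, matching the nesting that
                   * ops.set_weight() sees on the switching_to / reweight paths.
                   */
                  rq = task_rq_lock(p, &rf);
                  set_task_scx_weight(p);   /* publish p->scx.weight first */
                  scx_ops_enable_task(p);   /* ops.enable() sees a valid weight */
                  refresh_scx_weight(p);    /* only now invoke ops.set_weight() */
                  task_rq_unlock(rq, p, &rf);
          }
  }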
Signed-off-by: David Vernet --- kernel/sched/ext.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5d115c047992a..052c7857aad3f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2263,6 +2263,13 @@ static void scx_ops_disable_task(struct task_struct *p) } } +static void set_task_scx_weight(struct task_struct *p) +{ + u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); +} + /** * refresh_scx_weight - Refresh a task's ext weight * @p: task to refresh ext weight for @@ -2275,9 +2282,8 @@ static void scx_ops_disable_task(struct task_struct *p) */ static void refresh_scx_weight(struct task_struct *p) { - u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; - - p->scx.weight = sched_weight_to_cgroup(weight); + lockdep_assert_rq_held(task_rq(p)); + set_task_scx_weight(p); if (SCX_HAS_OP(set_weight)) SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); } @@ -2305,14 +2311,21 @@ int scx_fork(struct task_struct *p) void scx_post_fork(struct task_struct *p) { - refresh_scx_weight(p); - if (scx_enabled()) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); + /* + * Set the weight manually before calling ops.enable() so that + * the scheduler doesn't see a stale value if they inspect the + * task struct. We'll invoke ops.set_weight() afterwards, as it + * would be odd to receive a callback on the task before we + * tell the scheduler that it's been fully enabled. + */ + set_task_scx_weight(p); scx_ops_enable_task(p); + refresh_scx_weight(p); task_rq_unlock(rq, p, &rf); } From 1aa1a1ee61fa466f44a09a464f90ab940250d2bb Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sun, 9 Apr 2023 15:10:21 -0500 Subject: [PATCH 036/304] scx: Further document example schedulers We've had some confusion on both the upstream list, and in issues open on the sched_ext GitHub repo, of folks who don't know whether an example sched_ext scheduler is or isn't expected to perform well. We have header comments in the .bpf.c files for these schedulers, but folks may not see them. Let's add a README file to the tools/sched_ext repo which enumerates the various example schedulers, and tells people how to compile them. Signed-off-by: David Vernet --- tools/sched_ext/README | 264 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 tools/sched_ext/README diff --git a/tools/sched_ext/README b/tools/sched_ext/README new file mode 100644 index 0000000000000..4a748aaacb20e --- /dev/null +++ b/tools/sched_ext/README @@ -0,0 +1,264 @@ + ============================ + SCHED_EXT EXAMPLE SCHEDULERS + ============================ + +Introduction +============ + +This directory contains a number of example sched_ext schedulers. These +schedulers are meant to provide examples of different types of schedulers +that can be built using sched_ext, and illustrate how various features of +sched_ext can be used. + +Some of the examples are performant, production-ready schedulers. That is, for +the correct workload and with the correct tuning, they may be deployed in a +production environment with acceptable or possibly even improved performance. +Others are just examples that in practice, would not provide acceptable +performance (though they could be improved to get there). 
+ +This README will describe these example schedulers, including describing the +types of workloads or scenarios they're designed to accommodate, and whether or +not they're production ready. For more details on any of these schedulers, +please see the header comment in their .bpf.c file. + + +Compiling the examples +====================== + +There are a few toolchain dependencies for compiling the example schedulers. + +Toolchain dependencies +---------------------- + +1. clang >= 17.0 + +The schedulers are BPF programs, and therefore must be compiled with clang. gcc +is actively working on adding a BPF backend compiler as well, but are still +missing some features such as BTF type tags which are necessary for using +kptrs. + +clang 17.0 has not yet been released, so you'll need to compile it yourself if +you want to compile the benchmarks. + +2. rustup nightly + +Atropos's user space load balancing component is written in Rust, and uses +nightly features. You'll need to use the nightly build from rustup in order to +compile it. + +There are other requirements as well, such as make, but these are the main / +non-trivial ones. + +Compiling the schedulers +------------------------ + +Once you have your toolchain setup, you can compile the schedulers as follows: + +$ make CC=clang LLVM=1 -j + +See Documentation/scheduler/sched-ext.rst for a description of the config +options required to compile a sched_ext kernel. + +Schedulers +========== + +This section lists, in alphabetical order, all of the current example +schedulers. + +-------------------------------------------------------------------------------- + +Atropos +------- + +Overview +~~~~~~~~ + +A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the +scheduler does a simple round robin in each domain, and the user space portion +(written in Rust) calculates the load factor of each domain, and informs BPF of +how tasks should be load balanced accordingly. + +Typical Use Case +~~~~~~~~~~~~~~~~ + +Atropos is designed to be flexible, and accommodate different architectures and +workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), +as well as how Atropos should partition the system into scheduling domains, can +be tuned to achieve the optimal configuration for any given system or workload. + +Production Ready? +~~~~~~~~~~~~~~~~~ + +Yes. If tuned correctly, Atropos should be performant across various CPU +architectures and workloads. + +-------------------------------------------------------------------------------- + +scx_example_central +------------------- + +Overview +~~~~~~~~ + +A "central" scheduler where scheduling decisions are made from a single CPU. +This scheduler illustrates how scheduling decisions can be dispatched from a +single CPU, allowing other cores to run with infinite slices, without timer +ticks, and without having to incur the overhead of making scheduling decisions. + +Typical Use Case +~~~~~~~~~~~~~~~~ + +This scheduler could theoretically be useful for any workload that benefits +from minimizing scheduling overhead and timer ticks. An example of where this +could be particularly useful is running VMs, where running with infinite slices +and no timer ticks allows the VM to avoid unnecessary expensive vmexits. + +Production Ready? +~~~~~~~~~~~~~~~~~ + +Not yet. While tasks are run with an infinite slice (SCX_SLICE_INF), they're +preempted every 20ms in a timer callback. 
The scheduler also puts the core +schedling logic inside of the central / scheduling CPU's ops.dispatch() path, +and does not yet have any kind of priority mechanism. + +-------------------------------------------------------------------------------- + +scx_example_flatcg +------------------ + +Overview +~~~~~~~~ + +A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical +weight-based cgroup CPU control by flattening the cgroup hierarchy into a +single layer, by compounding the active weight share at each level. The effect +of this is a much more performant CPU controller, which does not need to +descend down cgroup trees in order to properly compute a cgroup's share. + +Typical Use Case +~~~~~~~~~~~~~~~~ + +This scheduler could be useful for any typical workload requiring a CPU +controller, but which cannot tolerate the higher overheads of the fair CPU +controller. + +Production Ready? +~~~~~~~~~~~~~~~~~ + +Yes, though the scheduler (currently) does not adequately accommodate +thundering herds of cgroups. If, for example, many cgroups which are nested +behind a low-priority cgroup were to wake up around the same time, they may be +able to consume more CPU cycles than they are entitled to. + +-------------------------------------------------------------------------------- + +scx_example_pair +---------------- + +Overview +~~~~~~~~ + +A sibling scheduler which ensures that tasks will only ever be co-located on a +physical core if they're in the same cgroup. It illustrates how a scheduling +policy could be implemented to mitigate CPU bugs, such as L1TF, and also shows +how some useful kfuncs such as scx_bpf_kick_cpu() can be utilized. + +Typical Use Case +~~~~~~~~~~~~~~~~ + +While this scheduler is only meant to be used to illustrate certain sched_ext +features, with a bit more work (e.g. by adding some form of priority handling +inside and across cgroups), it could have been used as a way to quickly +mitigate L1TF before core scheduling was implemented and rolled out. + +Production Ready? +~~~~~~~~~~~~~~~~~ + +No + +-------------------------------------------------------------------------------- + +scx_example_qmap +---------------- + +Overview +~~~~~~~~ + +Another simple, yet slightly more complex scheduler that provides an example of +a basic weighted FIFO queuing policy. It also provides examples of some common +useful BPF features, such as sleepable per-task storage allocation in the +ops.prep_enable() callback, and using the BPF_MAP_TYPE_QUEUE map type to +enqueue tasks. It also illustrates how core-sched support could be implemented. + +Typical Use Case +~~~~~~~~~~~~~~~~ + +Purely used to illustrate sched_ext features. + +**Production Ready?** + +No + +-------------------------------------------------------------------------------- + +scx_example_simple +------------------ + +Overview +~~~~~~~~ + +A simple scheduler that provides an example of a minimal sched_ext scheduler. +scx_example_simple can be run in either global weighted vtime mode, or FIFO +mode. + +Typical Use Case +~~~~~~~~~~~~~~~~ + +Though very simple, this scheduler should perform reasonably well on +single-socket CPUs with a uniform L3 cache topology. Note that while running in +global FIFO mode may work well for some workloads, saturating threads can +easily drown out inactive ones. + +Production Ready? 
+~~~~~~~~~~~~~~~~~ + +This scheduler could be used in a production environment, assuming the hardware +constraints enumerated above, and assuming the workload can accommodate a +simple scheduling policy. + +-------------------------------------------------------------------------------- + +scx_example_userland +-------------------- + +Overview +~~~~~~~~ + +A simple weighted vtime scheduler where all scheduling decisions take place in +user space. This is in contrast to Atropos, where load balancing lives in user +space, but scheduling decisions are still made in the kernel. + +Typical Use Case +~~~~~~~~~~~~~~~~ + +There are many advantages to writing schedulers in user space. For example, you +can use a debugger, you can write the scheduler in Rust, and you can use data +structures bundled with your favorite library. + +On the other hand, user space scheduling can be hard to get right. You can +potentially deadlock due to not scheduling a task that's required for the +scheduler itself to make forward progress (though the sched_ext watchdog will +protect the system by unloading your scheduler after a timeout if that +happens). You also have to bootstrap some communication protocol between the +kernel and user space. + +A more robust solution to this would be building a user space scheduling +framework that abstracts much of this complexity away from you. + +Production Ready? +~~~~~~~~~~~~~~~~~ + +No. This scheduler uses an ordered list for vtime scheduling, and is stricly +less performant than just using something like `scx_example_simple`. It is +purely meant to illustrate that it's possible to build a user space scheduler +on top of sched_ext. From d9988b4a552e15bb8f626f9e06f20b38361d5128 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 10 May 2023 11:42:56 -0700 Subject: [PATCH 037/304] atropos: Remove static libclang dependency Most distributions provide a shared lib version of libclang, so building atropos is not as easy as it should be. #6 is an example of this being confusing to other developers. We have no particular dependency on libclang being static, so this can be configured more conventionally. Signed-off-by: Dan Schatzberg --- tools/sched_ext/atropos/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/atropos/Cargo.toml b/tools/sched_ext/atropos/Cargo.toml index 7462a836d53dd..1e47d86fc2cf4 100644 --- a/tools/sched_ext/atropos/Cargo.toml +++ b/tools/sched_ext/atropos/Cargo.toml @@ -21,7 +21,7 @@ ordered-float = "3.4.0" simplelog = "0.12.0" [build-dependencies] -bindgen = { version = "0.61.0", features = ["logging", "static"], default-features = false } +bindgen = { version = "0.61.0" } libbpf-cargo = "0.13.0" [features] From 8ca3ab74d1558e3813710219522821ef76571f4e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 11 May 2023 10:28:33 -0500 Subject: [PATCH 038/304] scx: Fix scx_example_userland missing NULL check In commit d02c48fa1139 ("bpf: Make struct task_struct an RCU-safe type"), bpf_task_acquire() was updated to be KF_RET_NULL. Eishun pointed out that the userland example scheduler no longer loads, and it's because the usersched_task() helper function in scx_example_userland originally used bpf_get_current_task_btf() to avoid having to do a NULL check in the caller, but this no longer works due to bpf_task_acquire() being NULL-able. Let's just remove that weird logic that used bpf_get_current_task_btf(). When bpf_assert() lands, we can just use that to simplify instead. 
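The caller-side pattern that a KF_RET_NULL acquire kfunc requires is roughly the
following. This is an illustrative sketch rather than the exact code in the diff
below, and some_task stands in for whatever trusted task pointer is being
acquired:

  struct task_struct *p;

  p = bpf_task_acquire(some_task);   /* may now return NULL */
  if (!p)
          return;                    /* must be checked before any use */

  scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
  bpf_task_release(p);               /* drop the acquired reference */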
Fixes issue #9 Signed-off-by: David Vernet --- tools/sched_ext/scx_example_userland.bpf.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_example_userland.bpf.c index a089bc6bbe868..c28c1099d1885 100644 --- a/tools/sched_ext/scx_example_userland.bpf.c +++ b/tools/sched_ext/scx_example_userland.bpf.c @@ -97,17 +97,8 @@ static struct task_struct *usersched_task(void) * Should never happen -- the usersched task should always be managed * by sched_ext. */ - if (!p) { + if (!p) scx_bpf_error("Failed to find usersched task %d", usersched_pid); - /* - * We should never hit this path, and we error out of the - * scheduler above just in case, so the scheduler will soon be - * be evicted regardless. So as to simplify the logic in the - * caller to not have to check for NULL, return an acquired - * reference to the current task here rather than NULL. - */ - return bpf_task_acquire(bpf_get_current_task_btf()); - } return p; } @@ -147,8 +138,10 @@ static void dispatch_user_scheduler(void) usersched_needed = false; p = usersched_task(); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); - bpf_task_release(p); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } } static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) From 93ebf513425583b4b468cca7ab51890eacd98163 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 18 May 2023 12:24:43 -0500 Subject: [PATCH 039/304] scx: Auto-release example scheduler struct_ops on program exit When a BPF scheduler user-space program exits, the struct_ops map that it attaches to to load the BPF program is not automatically detached. This can cause the map and progs to remain loaded, which in turn will cause future attempts to load a new scheduling program to fail. In commit 8d1608d70927 ("libbpf: Create a bpf_link in bpf_map__attach_struct_ops()"), Kui-feng updated bpf_map__attach_struct_ops() to create an actual link for the struct_ops map, which in turn makes it automatically close the map when the owning program exits. This patch updates the example schedulers to leverage this by updating their sections to SEC(".struct_ops.link"). 
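For reference, the user space side of a link-backed struct_ops map looks roughly
like the minimal sketch below. The object file name is a placeholder and error
handling is omitted; the example schedulers attach through their generated
skeletons, but the underlying libbpf flow is the same:

  #include <bpf/libbpf.h>

  int load_scheduler(void)
  {
          struct bpf_object *obj = bpf_object__open_file("scheduler.bpf.o", NULL);
          struct bpf_map *ops_map;
          struct bpf_link *link;

          bpf_object__load(obj);
          ops_map = bpf_object__find_map_by_name(obj, "simple_ops");

          /*
           * With SEC(".struct_ops.link"), attaching creates a real bpf_link, so
           * the scheduler is detached when the link goes away -- including when
           * the process exits without explicitly destroying it.
           */
          link = bpf_map__attach_struct_ops(ops_map);

          /* ... run until shutdown ... */

          bpf_link__destroy(link);
          bpf_object__close(obj);
          return 0;
  }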
Signed-off-by: David Vernet --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 2 +- tools/sched_ext/scx_example_central.bpf.c | 2 +- tools/sched_ext/scx_example_flatcg.bpf.c | 2 +- tools/sched_ext/scx_example_pair.bpf.c | 2 +- tools/sched_ext/scx_example_qmap.bpf.c | 2 +- tools/sched_ext/scx_example_simple.bpf.c | 2 +- tools/sched_ext/scx_example_userland.bpf.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 0f9a3fcfcd9ee..0c36dd4918ca4 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -723,7 +723,7 @@ void BPF_STRUCT_OPS(atropos_exit, struct scx_exit_info *ei) exit_type = ei->type; } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops atropos = { .select_cpu = (void *)atropos_select_cpu, .enqueue = (void *)atropos_enqueue, diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c index 4cec04b4c2ede..f44b9365a1778 100644 --- a/tools/sched_ext/scx_example_central.bpf.c +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -314,7 +314,7 @@ void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops central_ops = { /* * We are offloading all scheduling decisions to the central CPU and diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_example_flatcg.bpf.c index f6078b9a681fe..cf5d96617a6ab 100644 --- a/tools/sched_ext/scx_example_flatcg.bpf.c +++ b/tools/sched_ext/scx_example_flatcg.bpf.c @@ -853,7 +853,7 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops flatcg_ops = { .enqueue = (void *)fcg_enqueue, .dispatch = (void *)fcg_dispatch, diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_example_pair.bpf.c index 279efe58b777b..078bdd94c9877 100644 --- a/tools/sched_ext/scx_example_pair.bpf.c +++ b/tools/sched_ext/scx_example_pair.bpf.c @@ -613,7 +613,7 @@ void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops pair_ops = { .enqueue = (void *)pair_enqueue, .dispatch = (void *)pair_dispatch, diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 579ab21ae4036..68871031d5cee 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -384,7 +384,7 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops qmap_ops = { .select_cpu = (void *)qmap_select_cpu, .enqueue = (void *)qmap_enqueue, diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_example_simple.bpf.c index 4bccca3e20470..db71837eb566d 100644 --- a/tools/sched_ext/scx_example_simple.bpf.c +++ b/tools/sched_ext/scx_example_simple.bpf.c @@ -117,7 +117,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops simple_ops = { .enqueue = (void *)simple_enqueue, .running = (void *)simple_running, diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_example_userland.bpf.c index c28c1099d1885..b5551e24f7834 100644 --- a/tools/sched_ext/scx_example_userland.bpf.c +++ 
b/tools/sched_ext/scx_example_userland.bpf.c @@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops userland_ops = { .select_cpu = (void *)userland_select_cpu, .enqueue = (void *)userland_enqueue, From f3c39e63ec65bfc2c45c67b3879971e8a48f63dc Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 19 May 2023 08:01:08 -0500 Subject: [PATCH 040/304] Revert "scx: Auto-release example scheduler struct_ops on program exit" This seems to break the programs. Not sure why I didn't run into this on local testing, but let's revert so it doesn't block people as in https://github.com/sched-ext/sched_ext/issues/12 This reverts commit 93ebf513425583b4b468cca7ab51890eacd98163. --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 2 +- tools/sched_ext/scx_example_central.bpf.c | 2 +- tools/sched_ext/scx_example_flatcg.bpf.c | 2 +- tools/sched_ext/scx_example_pair.bpf.c | 2 +- tools/sched_ext/scx_example_qmap.bpf.c | 2 +- tools/sched_ext/scx_example_simple.bpf.c | 2 +- tools/sched_ext/scx_example_userland.bpf.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 0c36dd4918ca4..0f9a3fcfcd9ee 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -723,7 +723,7 @@ void BPF_STRUCT_OPS(atropos_exit, struct scx_exit_info *ei) exit_type = ei->type; } -SEC(".struct_ops.link") +SEC(".struct_ops") struct sched_ext_ops atropos = { .select_cpu = (void *)atropos_select_cpu, .enqueue = (void *)atropos_enqueue, diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c index f44b9365a1778..4cec04b4c2ede 100644 --- a/tools/sched_ext/scx_example_central.bpf.c +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -314,7 +314,7 @@ void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops.link") +SEC(".struct_ops") struct sched_ext_ops central_ops = { /* * We are offloading all scheduling decisions to the central CPU and diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_example_flatcg.bpf.c index cf5d96617a6ab..f6078b9a681fe 100644 --- a/tools/sched_ext/scx_example_flatcg.bpf.c +++ b/tools/sched_ext/scx_example_flatcg.bpf.c @@ -853,7 +853,7 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops.link") +SEC(".struct_ops") struct sched_ext_ops flatcg_ops = { .enqueue = (void *)fcg_enqueue, .dispatch = (void *)fcg_dispatch, diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_example_pair.bpf.c index 078bdd94c9877..279efe58b777b 100644 --- a/tools/sched_ext/scx_example_pair.bpf.c +++ b/tools/sched_ext/scx_example_pair.bpf.c @@ -613,7 +613,7 @@ void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops.link") +SEC(".struct_ops") struct sched_ext_ops pair_ops = { .enqueue = (void *)pair_enqueue, .dispatch = (void *)pair_dispatch, diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 68871031d5cee..579ab21ae4036 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -384,7 +384,7 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops.link") +SEC(".struct_ops") struct sched_ext_ops qmap_ops = { 
.select_cpu = (void *)qmap_select_cpu, .enqueue = (void *)qmap_enqueue, diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_example_simple.bpf.c index db71837eb566d..4bccca3e20470 100644 --- a/tools/sched_ext/scx_example_simple.bpf.c +++ b/tools/sched_ext/scx_example_simple.bpf.c @@ -117,7 +117,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops.link") +SEC(".struct_ops") struct sched_ext_ops simple_ops = { .enqueue = (void *)simple_enqueue, .running = (void *)simple_running, diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_example_userland.bpf.c index b5551e24f7834..c28c1099d1885 100644 --- a/tools/sched_ext/scx_example_userland.bpf.c +++ b/tools/sched_ext/scx_example_userland.bpf.c @@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops.link") +SEC(".struct_ops") struct sched_ext_ops userland_ops = { .select_cpu = (void *)userland_select_cpu, .enqueue = (void *)userland_enqueue, From f6b15c3d44f610bd52f9b13adc7754bf6dcc45ca Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 22 May 2023 17:46:34 -0500 Subject: [PATCH 041/304] scx: Enable .link struct_ops in sched_ext In order to support .struct_ops.link maps for sched_ext schedulers, the sched_ext bpf_struct_ops structure must implement the .update() and .validate() callbacks. sched_ext cannot support atomically registering and unregistering a scheduler, so we don't add support for the .update() operation. With this, sched_ext now supports loading schedulers that will be auto-detached when the user space program exits. Signed-off-by: David Vernet --- kernel/sched/ext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 052c7857aad3f..04a74691b8803 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3555,6 +3555,23 @@ static int bpf_scx_init(struct btf *btf) return 0; } +static int bpf_scx_update(void *kdata, void *old_kdata) +{ + /* + * sched_ext does not support updating the actively-loaded BPF + * scheduler, as registering a BPF scheduler can always fail if the + * scheduler returns an error code for e.g. ops.init(), + * ops.prep_enable(), etc. Similarly, we can always race with + * unregistration happening elsewhere, such as with sysrq. + */ + return -EOPNOTSUPP; +} + +static int bpf_scx_validate(void *kdata) +{ + return 0; +} + /* "extern" to avoid sparse warning, only used in this file */ extern struct bpf_struct_ops bpf_sched_ext_ops; @@ -3565,6 +3582,8 @@ struct bpf_struct_ops bpf_sched_ext_ops = { .check_member = bpf_scx_check_member, .init_member = bpf_scx_init_member, .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, .name = "sched_ext_ops", }; From 37316dd3bf188beb76fca20bd6f7231f79c30a1f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 18 May 2023 12:24:43 -0500 Subject: [PATCH 042/304] scx: Auto-release example scheduler struct_ops on program exit When a BPF scheduler user-space program exits, the struct_ops map that it attaches to to load the BPF program is not automatically detached. This can cause the map and progs to remain loaded, which in turn will cause future attempts to load a new scheduling program to fail. 
In commit 8d1608d70927 ("libbpf: Create a bpf_link in bpf_map__attach_struct_ops()"), Kui-feng updated bpf_map__attach_struct_ops() to create an actual link for the struct_ops map, which in turn makes it automatically close the map when the owning program exits. This patch updates the example schedulers to leverage this by updating their sections to SEC(".struct_ops.link"). Signed-off-by: David Vernet --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 2 +- tools/sched_ext/scx_example_central.bpf.c | 2 +- tools/sched_ext/scx_example_flatcg.bpf.c | 2 +- tools/sched_ext/scx_example_pair.bpf.c | 2 +- tools/sched_ext/scx_example_qmap.bpf.c | 2 +- tools/sched_ext/scx_example_simple.bpf.c | 2 +- tools/sched_ext/scx_example_userland.bpf.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 0f9a3fcfcd9ee..0c36dd4918ca4 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -723,7 +723,7 @@ void BPF_STRUCT_OPS(atropos_exit, struct scx_exit_info *ei) exit_type = ei->type; } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops atropos = { .select_cpu = (void *)atropos_select_cpu, .enqueue = (void *)atropos_enqueue, diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c index 4cec04b4c2ede..f44b9365a1778 100644 --- a/tools/sched_ext/scx_example_central.bpf.c +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -314,7 +314,7 @@ void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops central_ops = { /* * We are offloading all scheduling decisions to the central CPU and diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_example_flatcg.bpf.c index f6078b9a681fe..cf5d96617a6ab 100644 --- a/tools/sched_ext/scx_example_flatcg.bpf.c +++ b/tools/sched_ext/scx_example_flatcg.bpf.c @@ -853,7 +853,7 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops flatcg_ops = { .enqueue = (void *)fcg_enqueue, .dispatch = (void *)fcg_dispatch, diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_example_pair.bpf.c index 279efe58b777b..078bdd94c9877 100644 --- a/tools/sched_ext/scx_example_pair.bpf.c +++ b/tools/sched_ext/scx_example_pair.bpf.c @@ -613,7 +613,7 @@ void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops pair_ops = { .enqueue = (void *)pair_enqueue, .dispatch = (void *)pair_dispatch, diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 579ab21ae4036..68871031d5cee 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -384,7 +384,7 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops qmap_ops = { .select_cpu = (void *)qmap_select_cpu, .enqueue = (void *)qmap_enqueue, diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_example_simple.bpf.c index 4bccca3e20470..db71837eb566d 100644 --- a/tools/sched_ext/scx_example_simple.bpf.c +++ b/tools/sched_ext/scx_example_simple.bpf.c @@ -117,7 +117,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) 
uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops simple_ops = { .enqueue = (void *)simple_enqueue, .running = (void *)simple_running, diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_example_userland.bpf.c index c28c1099d1885..b5551e24f7834 100644 --- a/tools/sched_ext/scx_example_userland.bpf.c +++ b/tools/sched_ext/scx_example_userland.bpf.c @@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) uei_record(&uei, ei); } -SEC(".struct_ops") +SEC(".struct_ops.link") struct sched_ext_ops userland_ops = { .select_cpu = (void *)userland_select_cpu, .enqueue = (void *)userland_enqueue, From c1f56c1c0800163b6ca32b0ac5c6deea2ede141c Mon Sep 17 00:00:00 2001 From: Eishun Kondoh Date: Wed, 31 May 2023 17:19:59 +0900 Subject: [PATCH 043/304] sched_ext: Documentation: scx_bpf_dispatch() requires a slice Signed-off-by: shun159 --- Documentation/scheduler/sched-ext.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 84c30b44f104c..2ef2f409f4a66 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -88,9 +88,9 @@ scheduler. void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) { if (enq_flags & SCX_ENQ_LOCAL) - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); else - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) From aa6d3d3dd150328f8de582278232a872dceb698e Mon Sep 17 00:00:00 2001 From: shun159 Date: Sun, 4 Jun 2023 16:46:51 +0900 Subject: [PATCH 044/304] sched_ext: Update code comment as scx_bpf_switch_all doesn't require 'into_scx' Signed-off-by: Eishun Kondoh --- kernel/sched/ext.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 04a74691b8803..beca68b9a6106 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3702,11 +3702,10 @@ __diag_ignore_all("-Wmissing-prototypes", /** * scx_bpf_switch_all - Switch all tasks into SCX - * @into_scx: switch direction * - * If @into_scx is %true, all existing and future non-dl/rt tasks are switched - * to SCX. If %false, only tasks which have %SCHED_EXT explicitly set are put on - * SCX. The actual switching is asynchronous. Can be called from ops.init(). + * Switch all existing and future non-dl/rt tasks to SCX. + * This can only be called from ops.init(), and actual switching + * is performed asynchronously. */ void scx_bpf_switch_all(void) { From cdc8d0f42481115a50e0fe535f1aa32c5449c00b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 8 Jun 2023 16:15:19 -0500 Subject: [PATCH 045/304] scx: Update schedulers to use new bpf_rbtree semantics bpf_rbtree_add() is now bpf_rbtree_add_impl(), and bpf_rbtree_remove() can return NULL. 
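For reference, the pattern the updated schedulers follow is sketched below. It assumes a flatcg-style layout (a cgv_tree rbtree of cgv_node entries protected by cgv_tree_lock, as in scx_example_flatcg.bpf.c): bpf_rbtree_add() is kept as a thin compatibility macro over bpf_rbtree_add_impl(), and every bpf_rbtree_remove() result must be checked for NULL before it is dereferenced.

	struct bpf_rb_node *rb_node;
	struct cgv_node *cgv_node;

	bpf_spin_lock(&cgv_tree_lock);
	rb_node = bpf_rbtree_first(&cgv_tree);
	if (!rb_node) {
		bpf_spin_unlock(&cgv_tree_lock);
		return;
	}
	rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
	bpf_spin_unlock(&cgv_tree_lock);

	if (!rb_node) {
		/* bpf_rbtree_remove() can now fail; report instead of using NULL */
		scx_bpf_error("node could not be removed");
		return;
	}

	cgv_node = container_of(rb_node, struct cgv_node, rb_node);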
Signed-off-by: David Vernet --- tools/sched_ext/scx_common.bpf.h | 7 +++++-- tools/sched_ext/scx_example_flatcg.bpf.c | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index e56de9dc86f28..aa7e3bb3c0c85 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -135,8 +135,11 @@ struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) __ksym; -void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, - bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym; +int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; /* task */ diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_example_flatcg.bpf.c index cf5d96617a6ab..855a33383f9e5 100644 --- a/tools/sched_ext/scx_example_flatcg.bpf.c +++ b/tools/sched_ext/scx_example_flatcg.bpf.c @@ -587,6 +587,16 @@ static bool try_pick_next_cgroup(u64 *cgidp) rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); bpf_spin_unlock(&cgv_tree_lock); + if (!rb_node) { + /* + * This should never happen. bpf_rbtree_first() was called + * above while the tree lock was held, so the node should + * always be present. + */ + scx_bpf_error("node could not be removed"); + return true; + } + cgv_node = container_of(rb_node, struct cgv_node, rb_node); cgid = cgv_node->cgid; From af8e2290d72693001924d18a1e8725110c7b6c46 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:53 -1000 Subject: [PATCH 046/304] SCX: Remove scx_has_idle_cpus This caches whether there are any idle CPUs which we used to test a lot more frequently. Now, the only remaining user is WAKE_SYNC path in scx_pick_idle_cpu(), which is not high-frequency enough to warrant track and cache the test result. Remove it. --- kernel/sched/ext.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index beca68b9a6106..12295daaee045 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -126,7 +126,6 @@ static struct { cpumask_var_t smt; } idle_masks CL_ALIGNED_IF_ONSTACK; -static bool __cacheline_aligned_in_smp scx_has_idle_cpus; #endif /* CONFIG_SMP */ /* for %SCX_KICK_WAIT */ @@ -1926,13 +1925,7 @@ void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, static bool test_and_clear_cpu_idle(int cpu) { - if (cpumask_test_and_clear_cpu(cpu, idle_masks.cpu)) { - if (cpumask_empty(idle_masks.cpu)) - scx_has_idle_cpus = false; - return true; - } else { - return false; - } + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); } static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed) @@ -1978,7 +1971,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag * local DSQ of the waker. 
*/ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && - scx_has_idle_cpus && !(current->flags & PF_EXITING)) { + !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, p->cpus_ptr)) { p->scx.flags |= SCX_TASK_ENQ_LOCAL; @@ -2045,7 +2038,6 @@ static void reset_idle_masks(void) /* consider all cpus idle, should converge to the actual state quickly */ cpumask_setall(idle_masks.cpu); cpumask_setall(idle_masks.smt); - scx_has_idle_cpus = true; } void __scx_update_idle(struct rq *rq, bool idle) @@ -2061,8 +2053,6 @@ void __scx_update_idle(struct rq *rq, bool idle) if (idle) { cpumask_set_cpu(cpu, idle_masks.cpu); - if (!scx_has_idle_cpus) - scx_has_idle_cpus = true; /* * idle_masks.smt handling is racy but that's fine as it's only @@ -2075,9 +2065,6 @@ void __scx_update_idle(struct rq *rq, bool idle) cpumask_or(idle_masks.smt, idle_masks.smt, sib_mask); } else { cpumask_clear_cpu(cpu, idle_masks.cpu); - if (scx_has_idle_cpus && cpumask_empty(idle_masks.cpu)) - scx_has_idle_cpus = false; - cpumask_andnot(idle_masks.smt, idle_masks.smt, sib_mask); } } From a108f05f6207583387ee2400e3efa0d9d338fda9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:53 -1000 Subject: [PATCH 047/304] SCX: Implement SCX_PICK_IDLE_CPU_WHOLE Add @flags to scx_bpf_pick_idle_cpu() and implement SCX_PICK_IDLE_CPU_WHOLE flag which makes the function try to pick a CPU whose SMT siblings are also idle. This will be used to improve idle CPU selection. --- include/linux/sched/ext.h | 1 - kernel/sched/ext.c | 20 +++++++++++-------- kernel/sched/ext.h | 4 ++++ tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 4 ++-- tools/sched_ext/scx_common.bpf.h | 2 +- tools/sched_ext/scx_example_qmap.bpf.c | 4 ++-- tools/sched_ext/scx_example_userland.bpf.c | 2 +- 7 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 61837aac8ab3e..9982e08522335 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -366,7 +366,6 @@ struct sched_ext_ops { * - scx_bpf_select_cpu_dfl() * - scx_bpf_test_and_clear_cpu_idle() * - scx_bpf_pick_idle_cpu() - * - scx_bpf_any_idle_cpu() * * The user also must implement ops.select_cpu() as the default * implementation relies on scx_bpf_select_cpu_dfl(). 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 12295daaee045..fd0aa55a57e63 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1928,7 +1928,7 @@ static bool test_and_clear_cpu_idle(int cpu) return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed) +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { int cpu; @@ -1948,6 +1948,9 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed) else cpumask_andnot(idle_masks.smt, idle_masks.smt, cpumask_of(cpu)); } else { + if (flags & SCX_PICK_IDLE_CPU_WHOLE) + return -EBUSY; + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); if (cpu >= nr_cpu_ids) return -EBUSY; @@ -1988,7 +1991,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag if (p->nr_cpus_allowed == 1) return prev_cpu; - cpu = scx_pick_idle_cpu(p->cpus_ptr); + cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) { p->scx.flags |= SCX_TASK_ENQ_LOCAL; return cpu; @@ -2084,7 +2087,7 @@ static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) #else /* !CONFIG_SMP */ static bool test_and_clear_cpu_idle(int cpu) { return false; } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed) { return -EBUSY; } +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } static void reset_idle_masks(void) {} #endif /* CONFIG_SMP */ @@ -3992,7 +3995,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { /** * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick - * @flags: SCX_KICK_* flags + * @flags: %SCX_KICK_* flags * * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or * trigger rescheduling on a busy CPU. This can be called from any online @@ -4081,21 +4084,22 @@ bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) /** * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags * - * Pick and claim an idle cpu which is also in @cpus_allowed. Returns the picked - * idle cpu number on success. -%EBUSY if no matching cpu was found. + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. * * Unavailable if ops.update_idle() is implemented and * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
*/ -s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed) +s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { if (!static_branch_likely(&scx_builtin_idle_enabled)) { scx_ops_error("built-in idle tracking is disabled"); return -EBUSY; } - return scx_pick_idle_cpu(cpus_allowed); + return scx_pick_idle_cpu(cpus_allowed, flags); } /** diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 998b790b39288..7bc5e871f7fd1 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -91,6 +91,10 @@ enum scx_kick_flags { SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ }; +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CPU_WHOLE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +}; + #ifdef CONFIG_SCHED_CLASS_EXT extern const struct sched_class ext_sched_class; diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 0c36dd4918ca4..af0ff06f0fd83 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -275,7 +275,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, return -ENOENT; /* If there is an eligible idle CPU, dispatch directly */ - cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask); + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) { stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); goto local; @@ -332,7 +332,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) scx_bpf_error("Failed to get task_ctx->cpumask"); return; } - cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask); + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) scx_bpf_kick_cpu(cpu, 0); diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index aa7e3bb3c0c85..7b82dd1f29752 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -61,7 +61,7 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; -s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 68871031d5cee..b6365df0fb640 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -117,7 +117,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, return prev_cpu; } - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) return cpu; @@ -191,7 +191,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) s32 cpu; scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) scx_bpf_kick_cpu(cpu, 0); return; diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_example_userland.bpf.c index b5551e24f7834..b62cce0b54e1b 100644 --- a/tools/sched_ext/scx_example_userland.bpf.c +++ b/tools/sched_ext/scx_example_userland.bpf.c @@ -122,7 
+122,7 @@ s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, return prev_cpu; } - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) { tctx->force_local = true; return cpu; From 09958d01f5487b9ea1e0bc61660cc02fed78adcc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:53 -1000 Subject: [PATCH 048/304] SCX: Prefer wholly idle CPUs in SMT machines Use the newly added SCX_PICK_IDLE_CPU_WHOLE to make scx_select_cpu_dfl() prefer CPUs which are wholly idle over prev_cpu that's only partially idle. While at it, add SMT conditionals to scx_pick_idle_cpu(). --- kernel/sched/ext.c | 54 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fd0aa55a57e63..b9cece8558349 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1932,7 +1932,9 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { int cpu; - do { +retry: +#ifdef CONFIG_SCHED_SMT + if (static_branch_likely(&sched_smt_present)) { cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); if (cpu < nr_cpu_ids) { const struct cpumask *sbm = topology_sibling_cpumask(cpu); @@ -1947,17 +1949,22 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) cpumask_andnot(idle_masks.smt, idle_masks.smt, sbm); else cpumask_andnot(idle_masks.smt, idle_masks.smt, cpumask_of(cpu)); - } else { - if (flags & SCX_PICK_IDLE_CPU_WHOLE) - return -EBUSY; - - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); - if (cpu >= nr_cpu_ids) - return -EBUSY; + goto found; } - } while (!test_and_clear_cpu_idle(cpu)); - return cpu; + if (flags & SCX_PICK_IDLE_CPU_WHOLE) + return -EBUSY; + } +#endif + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (test_and_clear_cpu_idle(cpu)) + return cpu; + else + goto retry; } static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -1982,14 +1989,33 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag } } - /* if the previous CPU is idle, dispatch directly to it */ - if (test_and_clear_cpu_idle(prev_cpu)) { - p->scx.flags |= SCX_TASK_ENQ_LOCAL; + if (p->nr_cpus_allowed == 1) return prev_cpu; + +#ifdef CONFIG_SCHED_SMT + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (static_branch_likely(&sched_smt_present)) { + if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && + test_and_clear_cpu_idle(prev_cpu)) { + p->scx.flags |= SCX_TASK_ENQ_LOCAL; + return prev_cpu; + } + + cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CPU_WHOLE); + if (cpu >= 0) { + p->scx.flags |= SCX_TASK_ENQ_LOCAL; + return cpu; + } } +#endif - if (p->nr_cpus_allowed == 1) + if (test_and_clear_cpu_idle(prev_cpu)) { + p->scx.flags |= SCX_TASK_ENQ_LOCAL; return prev_cpu; + } cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) { From 37cf5c6be96bdccd5d318ff87a28f2691afe380d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:53 -1000 Subject: [PATCH 049/304] SCX: simple, flatcg: Fix p->scx.dsq_vtime initialization In simple and flatcg, p->scx.dsq_vtime wasn't initialized on load or when a task is moved across cgroups. 
As vtimes start at zero, the bug is less noticeable on the first load; however, on subsequent loads and after cgroup migrations, some tasks may end up with vtime far into the future and stall for extended period of time. Re-init dsq_vtime on load and cgroup migrations. --- tools/sched_ext/scx_example_flatcg.bpf.c | 22 ++++++++++++++++++++++ tools/sched_ext/scx_example_simple.bpf.c | 7 +++++++ 2 files changed, 29 insertions(+) diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_example_flatcg.bpf.c index 855a33383f9e5..e79f941d588d9 100644 --- a/tools/sched_ext/scx_example_flatcg.bpf.c +++ b/tools/sched_ext/scx_example_flatcg.bpf.c @@ -753,6 +753,7 @@ s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, struct scx_enable_args *args) { struct fcg_task_ctx *taskc; + struct fcg_cgrp_ctx *cgc; /* * @p is new. Let's ensure that its task_ctx is available. We can sleep @@ -764,6 +765,12 @@ s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, return -ENOMEM; taskc->bypassed_at = 0; + + if (!(cgc = find_cgrp_ctx(args->cgroup))) + return -ENOENT; + + p->scx.dsq_vtime = cgc->tvtime_now; + return 0; } @@ -851,6 +858,20 @@ void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) scx_bpf_destroy_dsq(cgid); } +void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + struct fcg_cgrp_ctx *from_cgc, *to_cgc; + s64 vtime_delta; + + /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ + if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) + return; + + vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; + p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; +} + s32 BPF_STRUCT_OPS(fcg_init) { if (!switch_partial) @@ -875,6 +896,7 @@ struct sched_ext_ops flatcg_ops = { .cgroup_set_weight = (void *)fcg_cgroup_set_weight, .cgroup_init = (void *)fcg_cgroup_init, .cgroup_exit = (void *)fcg_cgroup_exit, + .cgroup_move = (void *)fcg_cgroup_move, .init = (void *)fcg_init, .exit = (void *)fcg_exit, .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING, diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_example_simple.bpf.c index db71837eb566d..d4528c7da4500 100644 --- a/tools/sched_ext/scx_example_simple.bpf.c +++ b/tools/sched_ext/scx_example_simple.bpf.c @@ -105,6 +105,12 @@ void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; } +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + p->scx.dsq_vtime = vtime_now; +} + s32 BPF_STRUCT_OPS(simple_init) { if (!switch_partial) @@ -122,6 +128,7 @@ struct sched_ext_ops simple_ops = { .enqueue = (void *)simple_enqueue, .running = (void *)simple_running, .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, .init = (void *)simple_init, .exit = (void *)simple_exit, .name = "simple", From ac99d0e2817ce981e9ff0bf53b3c5a26bf59e3a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 050/304] SCX: atropos: A bunch of minor and cosmetic updates * Drop short optoins for two less used options and always explicitly specify the char used for short option. * Use more compact code form for looking up a map and checking its result. * Make error messages more consistent. * s/dq/dsq/g * Other cosmetic & misc changes. 
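To make the "more compact code form" above concrete, the lookups in atropos.bpf.c now fold the map lookup and its NULL check into a single condition. A minimal sketch of the idiom, using names from the diff below:

	struct task_ctx *task_ctx;
	pid_t pid = p->pid;

	if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) {
		scx_bpf_error("Failed to lookup task_ctx");
		return;
	}
	/* task_ctx is known to be non-NULL from here on */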
--- include/linux/sched/ext.h | 4 +- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 113 ++++++++---------- tools/sched_ext/atropos/src/main.rs | 82 +++++++------ 3 files changed, 101 insertions(+), 98 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9982e08522335..26537b2f6c95c 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -370,8 +370,8 @@ struct sched_ext_ops { * The user also must implement ops.select_cpu() as the default * implementation relies on scx_bpf_select_cpu_dfl(). * - * If you keep the built-in idle tracking, specify the - * %SCX_OPS_KEEP_BUILTIN_IDLE flag. + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. */ void (*update_idle)(s32 cpu, bool idle); diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index af0ff06f0fd83..9c31b66758493 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -154,7 +154,7 @@ static bool task_set_dsq(struct task_ctx *task_ctx, struct task_struct *p, old_domc = bpf_map_lookup_elem(&dom_ctx, &old_dom_id); if (!old_domc) { - scx_bpf_error("No dom%u", old_dom_id); + scx_bpf_error("Failed to lookup old dom%u", old_dom_id); return false; } @@ -162,13 +162,13 @@ static bool task_set_dsq(struct task_ctx *task_ctx, struct task_struct *p, new_domc = bpf_map_lookup_elem(&dom_ctx, &new_dom_id); if (!new_domc) { - scx_bpf_error("No dom%u", new_dom_id); + scx_bpf_error("Failed to lookup new dom%u", new_dom_id); return false; } d_cpumask = new_domc->cpumask; if (!d_cpumask) { - scx_bpf_error("Failed to get domain %u cpumask kptr", + scx_bpf_error("Failed to get dom%u cpumask kptr", new_dom_id); return false; } @@ -197,19 +197,20 @@ static bool task_set_dsq(struct task_ctx *task_ctx, struct task_struct *p, s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, u32 wake_flags) { - s32 cpu; - pid_t pid = p->pid; - struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + struct task_ctx *task_ctx; struct bpf_cpumask *p_cpumask; + pid_t pid = p->pid; + s32 cpu; - if (!task_ctx) + if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid)) || + !(p_cpumask = task_ctx->cpumask)) return -ENOENT; if (kthreads_local && (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { cpu = prev_cpu; stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); - goto local; + goto direct; } /* @@ -234,7 +235,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, } d_cpumask = domc->cpumask; if (!d_cpumask) { - scx_bpf_error("Failed to acquire domain %u cpumask kptr", + scx_bpf_error("Failed to acquire dom%u cpumask kptr", task_ctx->dom_id); return prev_cpu; } @@ -250,7 +251,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, cpu = bpf_get_smp_processor_id(); if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { stat_add(ATROPOS_STAT_WAKE_SYNC, 1); - goto local; + goto direct; } } } @@ -260,25 +261,21 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { stat_add(ATROPOS_STAT_PREV_IDLE, 1); cpu = prev_cpu; - goto local; + goto direct; } /* If only one core is allowed, dispatch */ if (p->nr_cpus_allowed == 1) { stat_add(ATROPOS_STAT_PINNED, 1); cpu = prev_cpu; - goto local; + goto direct; } - p_cpumask = task_ctx->cpumask; - if (!p_cpumask) - return -ENOENT; - /* If there is an eligible idle CPU, dispatch directly */ cpu = 
scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) { stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); - goto local; + goto direct; } /* @@ -293,26 +290,27 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, return cpu; -local: +direct: task_ctx->dispatch_local = true; return cpu; } -void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) +void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) { + struct task_ctx *task_ctx; + struct bpf_cpumask *p_cpumask; pid_t pid = p->pid; - struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); u32 *new_dom; - if (!task_ctx) { - scx_bpf_error("No task_ctx[%d]", pid); + if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid)) || + !(p_cpumask = task_ctx->cpumask)) { + scx_bpf_error("Failed to lookup task_ctx or cpumask"); return; } new_dom = bpf_map_lookup_elem(&lb_data, &pid); if (new_dom && *new_dom != task_ctx->dom_id && task_set_dsq(task_ctx, p, *new_dom)) { - struct bpf_cpumask *p_cpumask; s32 cpu; stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); @@ -327,13 +325,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0); } - p_cpumask = task_ctx->cpumask; - if (!p_cpumask) { - scx_bpf_error("Failed to get task_ctx->cpumask"); - return; - } cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); - if (cpu >= 0) scx_bpf_kick_cpu(cpu, 0); } @@ -354,7 +346,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); if (!domc) { - scx_bpf_error("No dom[%u]", dom_id); + scx_bpf_error("Failed to lookup dom[%u]", dom_id); return; } @@ -442,11 +434,11 @@ void BPF_STRUCT_OPS(atropos_dispatch, s32 cpu, struct task_struct *prev) void BPF_STRUCT_OPS(atropos_runnable, struct task_struct *p, u64 enq_flags) { + struct task_ctx *task_ctx; pid_t pid = p->pid; - struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); - if (!task_ctx) { - scx_bpf_error("No task_ctx[%d]", pid); + if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { + scx_bpf_error("Failed to lookup task_ctx"); return; } @@ -465,14 +457,14 @@ void BPF_STRUCT_OPS(atropos_running, struct task_struct *p) taskc = bpf_map_lookup_elem(&task_data, &pid); if (!taskc) { - scx_bpf_error("No task_ctx[%d]", pid); + scx_bpf_error("Failed to lookup task_ctx"); return; } dom_id = taskc->dom_id; domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); if (!domc) { - scx_bpf_error("No dom[%u]", dom_id); + scx_bpf_error("Failed to lookup dom[%u]", dom_id); return; } @@ -497,11 +489,11 @@ void BPF_STRUCT_OPS(atropos_stopping, struct task_struct *p, bool runnable) void BPF_STRUCT_OPS(atropos_quiescent, struct task_struct *p, u64 deq_flags) { + struct task_ctx *task_ctx; pid_t pid = p->pid; - struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); - if (!task_ctx) { - scx_bpf_error("No task_ctx[%d]", pid); + if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { + scx_bpf_error("Failed to lookup task_ctx"); return; } @@ -511,11 +503,11 @@ void BPF_STRUCT_OPS(atropos_quiescent, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(atropos_set_weight, struct task_struct *p, u32 weight) { + struct task_ctx *task_ctx; pid_t pid = p->pid; - struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); - if (!task_ctx) { - scx_bpf_error("No task_ctx[%d]", pid); + if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { + scx_bpf_error("Failed to lookup task_ctx"); return; } @@ 
-576,17 +568,19 @@ static void task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, dom_id = pick_task_domain(task_ctx, p, cpumask); if (!task_set_dsq(task_ctx, p, dom_id)) - scx_bpf_error("Failed to set domain %d for %s[%d]", + scx_bpf_error("Failed to set dom%d for %s[%d]", dom_id, p->comm, p->pid); } void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, const struct cpumask *cpumask) { + struct task_ctx *task_ctx; pid_t pid = p->pid; - struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); - if (!task_ctx) { - scx_bpf_error("No task_ctx[%d]", pid); + + if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { + scx_bpf_error("Failed to lookup task_ctx for %s[%d]", + p->comm, pid); return; } @@ -725,19 +719,18 @@ void BPF_STRUCT_OPS(atropos_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops atropos = { - .select_cpu = (void *)atropos_select_cpu, - .enqueue = (void *)atropos_enqueue, - .dispatch = (void *)atropos_dispatch, - .runnable = (void *)atropos_runnable, - .running = (void *)atropos_running, - .stopping = (void *)atropos_stopping, - .quiescent = (void *)atropos_quiescent, - .set_weight = (void *)atropos_set_weight, - .set_cpumask = (void *)atropos_set_cpumask, - .prep_enable = (void *)atropos_prep_enable, - .disable = (void *)atropos_disable, - .init = (void *)atropos_init, - .exit = (void *)atropos_exit, - .flags = 0, - .name = "atropos", + .select_cpu = (void *)atropos_select_cpu, + .enqueue = (void *)atropos_enqueue, + .dispatch = (void *)atropos_dispatch, + .runnable = (void *)atropos_runnable, + .running = (void *)atropos_running, + .stopping = (void *)atropos_stopping, + .quiescent = (void *)atropos_quiescent, + .set_weight = (void *)atropos_set_weight, + .set_cpumask = (void *)atropos_set_cpumask, + .prep_enable = (void *)atropos_prep_enable, + .disable = (void *)atropos_disable, + .init = (void *)atropos_init, + .exit = (void *)atropos_exit, + .name = "atropos", }; diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index ebf8ba35c4b28..ccc02f72353de 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -8,38 +8,51 @@ pub use atropos::*; pub mod atropos_sys; use std::cell::Cell; -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::ffi::CStr; -use std::ops::Bound::{Included, Unbounded}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::ops::Bound::Included; +use std::ops::Bound::Unbounded; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; use std::sync::Arc; -use std::time::{Duration, SystemTime}; +use std::time::Duration; +use std::time::SystemTime; use ::fb_procfs as procfs; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; use bitvec::prelude::*; use clap::Parser; -use log::{info, trace, warn}; +use log::info; +use log::trace; +use log::warn; use ordered_float::OrderedFloat; /// Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF /// part does simple round robin in each domain and the userspace part /// calculates the load factor of each domain and tells the BPF part how to load /// balance the domains. - +/// /// This scheduler demonstrates dividing scheduling logic between BPF and /// userspace and using rust to build the userspace part. 
An earlier variant of /// this scheduler was used to balance across six domains, each representing a /// chiplet in a six-chiplet AMD processor, and could match the performance of /// production setup using CFS. +/// +/// WARNING: Atropos currenlty assumes that all domains have equal +/// processing power and at similar distances from each other. This +/// limitation will be removed in the future. #[derive(Debug, Parser)] struct Opts { /// Scheduling slice duration in microseconds. - #[clap(short, long, default_value = "20000")] + #[clap(short = 's', long, default_value = "20000")] slice_us: u64, /// Monitoring and load balance interval in seconds. - #[clap(short, long, default_value = "2.0")] + #[clap(short = 'i', long, default_value = "2.0")] interval: f64, /// Build domains according to how CPUs are grouped at this cache level @@ -59,7 +72,7 @@ struct Opts { /// cpu will attempt to steal tasks from a domain with at least /// greedy_threshold tasks enqueued. These tasks aren't permanently /// stolen from the domain. - #[clap(short, long, default_value = "4")] + #[clap(short = 'g', long, default_value = "4")] greedy_threshold: u32, /// The load decay factor. Every interval, the existing load is decayed @@ -67,32 +80,32 @@ struct Opts { /// 0.99]. The smaller the value, the more sensitive load calculation /// is to recent changes. When 0.0, history is ignored and the load /// value from the latest period is used directly. - #[clap(short, long, default_value = "0.5")] + #[clap(long, default_value = "0.5")] load_decay_factor: f64, /// Disable load balancing. Unless disabled, periodically userspace will /// calculate the load factor of each domain and instruct BPF which /// processes to move. - #[clap(short, long, action = clap::ArgAction::SetTrue)] + #[clap(long, action = clap::ArgAction::SetTrue)] no_load_balance: bool, /// Put per-cpu kthreads directly into local dsq's. - #[clap(short, long, action = clap::ArgAction::SetTrue)] + #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)] kthreads_local: bool, /// Use FIFO scheduling instead of weighted vtime scheduling. - #[clap(short, long, action = clap::ArgAction::SetTrue)] + #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)] fifo_sched: bool, /// If specified, only tasks which have their scheduling policy set to /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all /// tasks are switched. - #[clap(short, long, action = clap::ArgAction::SetTrue)] + #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)] partial: bool, /// Enable verbose output including libbpf details. Specify multiple /// times to increase verbosity. 
- #[clap(short, long, action = clap::ArgAction::Count)] + #[clap(short = 'v', long, action = clap::ArgAction::Count)] verbose: u8, } @@ -472,7 +485,7 @@ struct Scheduler<'a> { } impl<'a> Scheduler<'a> { - // Returns Vec of cpuset for each dq and a vec of dq for each cpu + // Returns Vec of cpuset for each dsq and a vec of dsq for each cpu fn parse_cpusets( cpumasks: &[String], nr_cpus: usize, @@ -487,7 +500,7 @@ impl<'a> Scheduler<'a> { let mut cpus = vec![-1i32; nr_cpus]; let mut cpusets = vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; cpumasks.len()]; - for (dq, cpumask) in cpumasks.iter().enumerate() { + for (dsq, cpumask) in cpumasks.iter().enumerate() { let hex_str = { let mut tmp_str = cpumask .strip_prefix("0x") @@ -520,32 +533,32 @@ impl<'a> Scheduler<'a> { } if cpus[cpu] != -1 { bail!( - "Found cpu ({}) with dq ({}) but also in cpumask ({})", + "Found cpu ({}) with dsq ({}) but also in cpumask ({})", cpu, cpus[cpu], cpumask ); } - cpus[cpu] = dq as i32; - cpusets[dq].set(cpu, true); + cpus[cpu] = dsq as i32; + cpusets[dsq].set(cpu, true); } } - cpusets[dq].set_uninitialized(false); + cpusets[dsq].set_uninitialized(false); } - for (cpu, &dq) in cpus.iter().enumerate() { - if dq < 0 { + for (cpu, &dsq) in cpus.iter().enumerate() { + if dsq < 0 { bail!( - "Cpu {} not assigned to any dq. Make sure it is covered by some --cpumasks argument.", - cpu - ); + "Cpu {} not assigned to any dsq. Make sure it is covered by some --cpumasks argument.", + cpu + ); } } Ok((cpusets, cpus)) } - // Returns Vec of cpuset for each dq and a vec of dq for each cpu + // Returns Vec of cpuset for each dsq and a vec of dsq for each cpu fn cpusets_from_cache( level: u32, nr_cpus: usize, @@ -802,10 +815,10 @@ impl<'a> Scheduler<'a> { + stat(atropos_sys::stat_idx_ATROPOS_STAT_LAST_TASK); info!( - "cpu={:6.1} load_avg={:7.1} bal={} task_err={} lb_data_err={} proc={:?}ms", + "cpu={:7.2} bal={} load_avg={:8.2} task_err={} lb_data_err={} proc={:?}ms", cpu_busy * 100.0, - load_avg, stats[atropos_sys::stat_idx_ATROPOS_STAT_LOAD_BALANCE as usize], + load_avg, stats[atropos_sys::stat_idx_ATROPOS_STAT_TASK_GET_ERR as usize], self.nr_lb_data_errors, processing_dur.as_millis(), @@ -814,7 +827,7 @@ impl<'a> Scheduler<'a> { let stat_pct = |idx| stat(idx) as f64 / total as f64 * 100.0; info!( - "tot={:6} wsync={:4.1} prev_idle={:4.1} pin={:4.1} dir={:4.1} dq={:4.1} greedy={:4.1}", + "tot={:7} wsync={:5.2} prev_idle={:5.2} pin={:5.2} dir={:5.2} dsq={:5.2} greedy={:5.2}", total, stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE), @@ -826,11 +839,8 @@ impl<'a> Scheduler<'a> { for i in 0..self.nr_doms { info!( - "DOM[{:02}] load={:7.1} to_pull={:7.1} to_push={:7.1}", - i, - dom_loads[i], - if imbal[i] < 0.0 { -imbal[i] } else { 0.0 }, - if imbal[i] > 0.0 { imbal[i] } else { 0.0 }, + "DOM[{:02}] load={:8.2} imbal={:+9.2}", + i, dom_loads[i], imbal[i], ); } } From e464b94a424fc14d95468e64fb02aeba8b36a0eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 051/304] SCX: atropos: Clean up stats Drop unused LAST_TASK and re-group the rest. 
--- tools/sched_ext/atropos/src/bpf/atropos.h | 9 +++++++-- tools/sched_ext/atropos/src/main.rs | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.h b/tools/sched_ext/atropos/src/bpf/atropos.h index addf29ca104a5..f2b8214c3c9ec 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.h +++ b/tools/sched_ext/atropos/src/bpf/atropos.h @@ -19,15 +19,20 @@ /* Statistics */ enum stat_idx { - ATROPOS_STAT_TASK_GET_ERR, + /* The following fields add up to all dispatched tasks */ ATROPOS_STAT_WAKE_SYNC, ATROPOS_STAT_PREV_IDLE, ATROPOS_STAT_PINNED, ATROPOS_STAT_DIRECT_DISPATCH, ATROPOS_STAT_DSQ_DISPATCH, ATROPOS_STAT_GREEDY, + + /* Extra stats that don't contribute to total */ ATROPOS_STAT_LOAD_BALANCE, - ATROPOS_STAT_LAST_TASK, + + /* Errors */ + ATROPOS_STAT_TASK_GET_ERR, + ATROPOS_NR_STATS, }; diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index ccc02f72353de..3990d6ec8df30 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -811,8 +811,7 @@ impl<'a> Scheduler<'a> { + stat(atropos_sys::stat_idx_ATROPOS_STAT_PINNED) + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH) + stat(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_LAST_TASK); + + stat(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY); info!( "cpu={:7.2} bal={} load_avg={:8.2} task_err={} lb_data_err={} proc={:?}ms", From 7e7dbae885c59d2d1ad9eb81e577d5a04cde9ade Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 052/304] SCX: atropos: Cleanup the names of task_set_dsq() and friends task_set_dsq() is a bit of misnomer and inconsistent with other functions including pick_task_domain(). The names are really confusing. Clean up. 
* task_set_dsq() -> task_set_domain() * pick_task_domain() -> task_pick_domain() * task_set_domain() -> task_pick_and_set_domain() --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 9c31b66758493..cb92826df14e3 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -144,8 +144,8 @@ static inline bool vtime_before(u64 a, u64 b) return (s64)(a - b) < 0; } -static bool task_set_dsq(struct task_ctx *task_ctx, struct task_struct *p, - u32 new_dom_id) +static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, + u32 new_dom_id) { struct dom_ctx *old_domc, *new_domc; struct bpf_cpumask *d_cpumask, *t_cpumask; @@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) new_dom = bpf_map_lookup_elem(&lb_data, &pid); if (new_dom && *new_dom != task_ctx->dom_id && - task_set_dsq(task_ctx, p, *new_dom)) { + task_set_domain(task_ctx, p, *new_dom)) { s32 cpu; stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); @@ -514,7 +514,7 @@ void BPF_STRUCT_OPS(atropos_set_weight, struct task_struct *p, u32 weight) task_ctx->weight = weight; } -struct pick_task_domain_loop_ctx { +struct task_pick_domain_loop_ctx { struct task_struct *p; const struct cpumask *cpumask; u64 dom_mask; @@ -522,9 +522,9 @@ struct pick_task_domain_loop_ctx { u32 dom_id; }; -static int pick_task_domain_loopfn(u32 idx, void *data) +static int task_pick_domain_loopfn(u32 idx, void *data) { - struct pick_task_domain_loop_ctx *lctx = data; + struct task_pick_domain_loop_ctx *lctx = data; u32 dom_id = (lctx->dom_rr_base + idx) % nr_doms; if (dom_id >= MAX_DOMS) @@ -538,10 +538,10 @@ static int pick_task_domain_loopfn(u32 idx, void *data) return 0; } -static u32 pick_task_domain(struct task_ctx *task_ctx, struct task_struct *p, +static u32 task_pick_domain(struct task_ctx *task_ctx, struct task_struct *p, const struct cpumask *cpumask) { - struct pick_task_domain_loop_ctx lctx = { + struct task_pick_domain_loop_ctx lctx = { .p = p, .cpumask = cpumask, .dom_id = MAX_DOMS, @@ -553,21 +553,22 @@ static u32 pick_task_domain(struct task_ctx *task_ctx, struct task_struct *p, lctx.dom_rr_base = ++(pcpu_ctx[cpu].dom_rr_cur); - bpf_loop(nr_doms, pick_task_domain_loopfn, &lctx, 0); + bpf_loop(nr_doms, task_pick_domain_loopfn, &lctx, 0); task_ctx->dom_mask = lctx.dom_mask; return lctx.dom_id; } -static void task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, - const struct cpumask *cpumask) +static void task_pick_and_set_domain(struct task_ctx *task_ctx, + struct task_struct *p, + const struct cpumask *cpumask) { u32 dom_id = 0; if (nr_doms > 1) - dom_id = pick_task_domain(task_ctx, p, cpumask); + dom_id = task_pick_domain(task_ctx, p, cpumask); - if (!task_set_dsq(task_ctx, p, dom_id)) + if (!task_set_domain(task_ctx, p, dom_id)) scx_bpf_error("Failed to set dom%d for %s[%d]", dom_id, p->comm, p->pid); } @@ -584,7 +585,7 @@ void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, return; } - task_set_domain(task_ctx, p, cpumask); + task_pick_and_set_domain(task_ctx, p, cpumask); } s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, @@ -627,7 +628,7 @@ s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, return -EINVAL; } - task_set_domain(map_value, p, p->cpus_ptr); + task_pick_and_set_domain(map_value, p, p->cpus_ptr); return 0; } 
From 67c288ec473debe65bcaf2e6bfcbbcebadaed106 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 053/304] SCX: atropos: Replace bpf_loop() usages with the new inline loops --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 74 ++++++++----------- 1 file changed, 30 insertions(+), 44 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index cb92826df14e3..f99c8cdbf4eed 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -514,49 +514,28 @@ void BPF_STRUCT_OPS(atropos_set_weight, struct task_struct *p, u32 weight) task_ctx->weight = weight; } -struct task_pick_domain_loop_ctx { - struct task_struct *p; - const struct cpumask *cpumask; - u64 dom_mask; - u32 dom_rr_base; - u32 dom_id; -}; - -static int task_pick_domain_loopfn(u32 idx, void *data) -{ - struct task_pick_domain_loop_ctx *lctx = data; - u32 dom_id = (lctx->dom_rr_base + idx) % nr_doms; - - if (dom_id >= MAX_DOMS) - return 1; - - if (cpumask_intersects_domain(lctx->cpumask, dom_id)) { - lctx->dom_mask |= 1LLU << dom_id; - if (lctx->dom_id == MAX_DOMS) - lctx->dom_id = dom_id; - } - return 0; -} - static u32 task_pick_domain(struct task_ctx *task_ctx, struct task_struct *p, const struct cpumask *cpumask) { - struct task_pick_domain_loop_ctx lctx = { - .p = p, - .cpumask = cpumask, - .dom_id = MAX_DOMS, - }; s32 cpu = bpf_get_smp_processor_id(); + u32 first_dom = MAX_DOMS, dom; if (cpu < 0 || cpu >= MAX_CPUS) return MAX_DOMS; - lctx.dom_rr_base = ++(pcpu_ctx[cpu].dom_rr_cur); + task_ctx->dom_mask = 0; - bpf_loop(nr_doms, task_pick_domain_loopfn, &lctx, 0); - task_ctx->dom_mask = lctx.dom_mask; + dom = pcpu_ctx[cpu].dom_rr_cur++; + bpf_repeat(nr_doms) { + dom = (dom + 1) % nr_doms; + if (cpumask_intersects_domain(cpumask, dom)) { + task_ctx->dom_mask |= 1LLU << dom; + if (first_dom == MAX_DOMS) + first_dom = dom; + } + } - return lctx.dom_id; + return first_dom; } static void task_pick_and_set_domain(struct task_ctx *task_ctx, @@ -643,36 +622,36 @@ void BPF_STRUCT_OPS(atropos_disable, struct task_struct *p) } } -static int create_dom_dsq(u32 idx, void *data) +static s32 create_dom(u32 dom_id) { struct dom_ctx domc_init = {}, *domc; struct bpf_cpumask *cpumask; - u32 cpu, dom_id = idx; + u32 cpu; s32 ret; ret = scx_bpf_create_dsq(dom_id, -1); if (ret < 0) { scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret); - return 1; + return ret; } ret = bpf_map_update_elem(&dom_ctx, &dom_id, &domc_init, 0); if (ret) { scx_bpf_error("Failed to add dom_ctx entry %u (%d)", dom_id, ret); - return 1; + return ret; } domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); if (!domc) { /* Should never happen, we just inserted it above. 
*/ scx_bpf_error("No dom%u", dom_id); - return 1; + return -ENOENT; } cpumask = bpf_cpumask_create(); if (!cpumask) { scx_bpf_error("Failed to create BPF cpumask for domain %u", dom_id); - return 1; + return -ENOMEM; } for (cpu = 0; cpu < MAX_CPUS; cpu++) { @@ -682,7 +661,7 @@ static int create_dom_dsq(u32 idx, void *data) if (!dmask) { scx_bpf_error("array index error"); bpf_cpumask_release(cpumask); - return 1; + return -ENOENT; } if (*dmask & (1LLU << (cpu % 64))) @@ -691,20 +670,27 @@ static int create_dom_dsq(u32 idx, void *data) cpumask = bpf_kptr_xchg(&domc->cpumask, cpumask); if (cpumask) { - scx_bpf_error("Domain %u was already present", dom_id); + scx_bpf_error("Domain %u cpumask already present", dom_id); bpf_cpumask_release(cpumask); - return 1; + return -EEXIST; } return 0; } -int BPF_STRUCT_OPS_SLEEPABLE(atropos_init) +s32 BPF_STRUCT_OPS_SLEEPABLE(atropos_init) { + struct bpf_cpumask *cpumask; + s32 i, ret; + if (!switch_partial) scx_bpf_switch_all(); - bpf_loop(nr_doms, create_dom_dsq, NULL, 0); + bpf_for(i, 0, nr_doms) { + ret = create_dom(i); + if (ret) + return ret; + } for (u32 i = 0; i < nr_cpus; i++) pcpu_ctx[i].dom_rr_cur = i; From 1346c5980bf4b7165513474e81a57000e8501b4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 054/304] SCX: atropos: Factor out / relocate utility functions to prepare for future changes. No functional changes. --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 28 +++++++++---------- tools/sched_ext/atropos/src/main.rs | 15 ++++++---- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index f99c8cdbf4eed..b8a4ef15df765 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -144,6 +144,20 @@ static inline bool vtime_before(u64 a, u64 b) return (s64)(a - b) < 0; } +static u32 cpu_to_dom_id(s32 cpu) +{ + const volatile u32 *dom_idp; + + if (nr_doms <= 1) + return 0; + + dom_idp = MEMBER_VPTR(cpu_dom_id_map, [cpu]); + if (!dom_idp) + return MAX_DOMS; + + return *dom_idp; +} + static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, u32 new_dom_id) { @@ -362,20 +376,6 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) } } -static u32 cpu_to_dom_id(s32 cpu) -{ - const volatile u32 *dom_idp; - - if (nr_doms <= 1) - return 0; - - dom_idp = MEMBER_VPTR(cpu_dom_id_map, [cpu]); - if (!dom_idp) - return MAX_DOMS; - - return *dom_idp; -} - static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id) { s32 cpu; diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 3990d6ec8df30..907b9c3063481 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -136,6 +136,14 @@ fn clear_map(map: &mut libbpf_rs::Map) { } } +fn format_cpumask(cpumask: &[u64], nr_cpus: usize) -> String { + cpumask + .iter() + .take((nr_cpus + 64) / 64) + .rev() + .fold(String::new(), |acc, x| format!("{} {:016X}", acc, x)) +} + #[derive(Debug)] struct TaskLoad { runnable_for: u64, @@ -658,15 +666,10 @@ impl<'a> Scheduler<'a> { let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpuset_slice.len()); left.clone_from_slice(cpuset.as_raw_slice()); - let cpumask_str = dom_cpumask_slice - .iter() - .take((nr_cpus + 63) / 64) - .rev() - .fold(String::new(), |acc, x| format!("{} 
{:016X}", acc, x)); info!( "DOM[{:02}] cpumask{} ({} cpus)", dom, - &cpumask_str, + &format_cpumask(dom_cpumask_slice, nr_cpus), cpuset.count_ones() ); } From 0076b4ce8bceed90b72d281c65d40dff33d56259 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 055/304] SCX: Disable ttwu_queue for tasks on SCX The BPF scheduler may depend on select_task_rq() being invoked during wakeups and @p may end up executing on a different CPU regardless of what happens in the wakeup path making the ttwu_queue optimization ineffective. Skip if on SCX. Combined with an atropos bug, this caused significant execution stalls in low to moderate load conditions. --- kernel/sched/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1c065ffe1e8e6..428f2fa2c954e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3954,6 +3954,16 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { +#ifdef CONFIG_SCHED_CLASS_EXT + /* + * The BPF scheduler may depend on select_task_rq() being invoked during + * wakeups and @p may end up executing on a different CPU regardless of + * what happens in the wakeup path making the ttwu_queue optimization + * ineffective. Skip if on SCX. + */ + if (p->sched_class == &ext_sched_class) + return false; +#endif /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. From f7aeaae1bd65728a5768e73fb508a1e8a0969743 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 056/304] SCX: atropos: Fix padding size in struct pcpu_ctx _padding's size is off by 4 bytes. This was okay in practice because struct alignment would still put it at the same size but let's still fix it. --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index b8a4ef15df765..61a761d706e0d 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -74,11 +74,14 @@ const volatile __u64 slice_ns = SCX_SLICE_DFL; int exit_type = SCX_EXIT_NONE; char exit_msg[SCX_EXIT_MSG_LEN]; +/* + * Per-CPU context + */ struct pcpu_ctx { __u32 dom_rr_cur; /* used when scanning other doms */ /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */ - __u8 _padding[CACHELINE_SIZE - sizeof(u64)]; + __u8 _padding[CACHELINE_SIZE - sizeof(u32)]; } __attribute__((aligned(CACHELINE_SIZE))); struct pcpu_ctx pcpu_ctx[MAX_CPUS]; From 736039a67edaf7c8546015cea7fb1b566b3b4476 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 057/304] SCX: atropos: Fix execution bubble caused by enqueueing while on a foreign CPU When not dispatching directly, ->select_cpu() would always put the waking task on one of the domestic CPUs which guarantees that a CPU will soon try to dispatch from the DSQ. However, a task can be enqueued outside the enqueue path and thus without going through ->select_cpu(). If the task was on a foreign CPU before, this could lead to the task being queued on its domain's DSQ while none of its CPUs being woken up leading to execution bubbles. Fix it by explicitly kicking a domestic CPU if the task being enqueued to a domain DSQ is on a foreign CPU. 
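The shape of the fix, as applied to atropos_enqueue() in the diff below (a sketch only; p_cpumask is the task's domestic cpumask kptr and cpu is an s32):

	if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p),
				  (const struct cpumask *)p_cpumask)) {
		/*
		 * @p is currently on a foreign CPU. Kick a domestic CPU so
		 * that the domain DSQ @p is about to be queued on gets
		 * serviced.
		 */
		cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0);
		if (cpu < 0)
			cpu = bpf_cpumask_any((const struct cpumask *)p_cpumask);
		scx_bpf_kick_cpu(cpu, 0);
	}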
--- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 24 +++++++++++++++++-- tools/sched_ext/atropos/src/bpf/atropos.h | 1 + tools/sched_ext/atropos/src/main.rs | 7 +++++- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 61a761d706e0d..9e29e7942c791 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -318,6 +318,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) struct bpf_cpumask *p_cpumask; pid_t pid = p->pid; u32 *new_dom; + s32 cpu; if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid)) || !(p_cpumask = task_ctx->cpumask)) { @@ -325,11 +326,12 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* + * Migrate @p to a new domain if requested by userland through lb_data. + */ new_dom = bpf_map_lookup_elem(&lb_data, &pid); if (new_dom && *new_dom != task_ctx->dom_id && task_set_domain(task_ctx, p, *new_dom)) { - s32 cpu; - stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); /* @@ -345,6 +347,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) scx_bpf_kick_cpu(cpu, 0); + goto dom_queue; } if (task_ctx->dispatch_local) { @@ -353,6 +356,23 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* + * @p is about to be queued on its domain's dsq. However, @p may be on a + * foreign CPU due to a greedy execution and not have gone through + * ->select_cpu() if it's being enqueued e.g. after slice exhaustion. If + * so, @p would be queued on its domain's dsq but none of the CPUs in + * the domain would be woken up for it which can induce execution + * bubles. Kick a domestic CPU if @p is on a foreign domain. 
+ */ + if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), (const struct cpumask *)p_cpumask)) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); + if (cpu < 0) + cpu = bpf_cpumask_any((const struct cpumask *)p_cpumask); + scx_bpf_kick_cpu(cpu, 0); + stat_add(ATROPOS_STAT_REPATRIATE, 1); + } + +dom_queue: if (fifo_sched) { scx_bpf_dispatch(p, task_ctx->dom_id, slice_ns, enq_flags); diff --git a/tools/sched_ext/atropos/src/bpf/atropos.h b/tools/sched_ext/atropos/src/bpf/atropos.h index f2b8214c3c9ec..3bc71493a027c 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.h +++ b/tools/sched_ext/atropos/src/bpf/atropos.h @@ -28,6 +28,7 @@ enum stat_idx { ATROPOS_STAT_GREEDY, /* Extra stats that don't contribute to total */ + ATROPOS_STAT_REPATRIATE, ATROPOS_STAT_LOAD_BALANCE, /* Errors */ diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 907b9c3063481..c6bf97727c051 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -829,14 +829,19 @@ impl<'a> Scheduler<'a> { let stat_pct = |idx| stat(idx) as f64 / total as f64 * 100.0; info!( - "tot={:7} wsync={:5.2} prev_idle={:5.2} pin={:5.2} dir={:5.2} dsq={:5.2} greedy={:5.2}", + "tot={:7} wsync={:5.2} prev_idle={:5.2} pin={:5.2} dir={:5.2}", total, stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PINNED), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH), + ); + + info!( + "dsq={:5.2} greedy={:5.2} rep={:5.2}", stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_REPATRIATE), ); for i in 0..self.nr_doms { From 669fd8c3ce472b487323929a527890c66e5c21d1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:54 -1000 Subject: [PATCH 058/304] SCX: atropos: Fix vtime initialization p->scx.dsq_vtime wasn't being initialized on scheduler load which could put some tasks far into the future leading to long stalls. Fix it. 
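The stall is easier to see with a small standalone sketch (illustrative only, not part of the patch). vtime_before() below mirrors the helper already used in atropos.bpf.c; the vtime values are invented.

	#include <stdio.h>
	#include <stdint.h>

	/* same signed-wraparound comparison as the helper in atropos.bpf.c */
	static int vtime_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;
	}

	int main(void)
	{
		uint64_t vtime_now = 1 << 20;			/* domain's current vtime */
		uint64_t fresh = vtime_now;			/* dsq_vtime initialized on load */
		uint64_t stale = vtime_now + (1ULL << 40);	/* uninitialized leftover value */

		/* A vtime-ordered DSQ consumes the smallest dsq_vtime first, so the
		 * fresh task always sorts ahead of the stale one ... */
		printf("fresh runs before stale: %d\n", vtime_before(fresh, stale));

		/* ... and the stale task stays parked until the domain's vtime_now
		 * advances past its dsq_vtime, which shows up as a long stall. */
		printf("stale is still in the future: %d\n", vtime_before(vtime_now, stale));
		return 0;
	}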
--- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 9e29e7942c791..12b3b23e54687 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -162,7 +162,7 @@ static u32 cpu_to_dom_id(s32 cpu) } static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, - u32 new_dom_id) + u32 new_dom_id, bool init_dsq_vtime) { struct dom_ctx *old_domc, *new_domc; struct bpf_cpumask *d_cpumask, *t_cpumask; @@ -175,7 +175,10 @@ static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, return false; } - vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now; + if (init_dsq_vtime) + vtime_delta = 0; + else + vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now; new_domc = bpf_map_lookup_elem(&dom_ctx, &new_dom_id); if (!new_domc) { @@ -331,7 +334,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) */ new_dom = bpf_map_lookup_elem(&lb_data, &pid); if (new_dom && *new_dom != task_ctx->dom_id && - task_set_domain(task_ctx, p, *new_dom)) { + task_set_domain(task_ctx, p, *new_dom, false)) { stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); /* @@ -563,14 +566,15 @@ static u32 task_pick_domain(struct task_ctx *task_ctx, struct task_struct *p, static void task_pick_and_set_domain(struct task_ctx *task_ctx, struct task_struct *p, - const struct cpumask *cpumask) + const struct cpumask *cpumask, + bool init_dsq_vtime) { u32 dom_id = 0; if (nr_doms > 1) dom_id = task_pick_domain(task_ctx, p, cpumask); - if (!task_set_domain(task_ctx, p, dom_id)) + if (!task_set_domain(task_ctx, p, dom_id, init_dsq_vtime)) scx_bpf_error("Failed to set dom%d for %s[%d]", dom_id, p->comm, p->pid); } @@ -587,7 +591,7 @@ void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, return; } - task_pick_and_set_domain(task_ctx, p, cpumask); + task_pick_and_set_domain(task_ctx, p, cpumask, false); } s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, @@ -630,7 +634,7 @@ s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, return -EINVAL; } - task_pick_and_set_domain(map_value, p, p->cpus_ptr); + task_pick_and_set_domain(map_value, p, p->cpus_ptr, true); return 0; } From ce0a5f8bc0ea2521bcdf72d85ebdb927b987c6f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:55 -1000 Subject: [PATCH 059/304] SCX: atropos: Add --balanced-kworkers To avoid stepping over the L3-aware behavior of the new workqueue code which sets permissive cpus_allowed and uses ->wake_cpu to steer kworkers. 
--- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 1 + tools/sched_ext/atropos/src/bpf/atropos.h | 1 + tools/sched_ext/atropos/src/main.rs | 28 +++++++++++++++++-- tools/sched_ext/scx_common.bpf.h | 1 + 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 12b3b23e54687..57fc885cd280a 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -469,6 +469,7 @@ void BPF_STRUCT_OPS(atropos_runnable, struct task_struct *p, u64 enq_flags) } task_ctx->runnable_at = bpf_ktime_get_ns(); + task_ctx->is_kworker = p->flags & PF_WQ_WORKER; } void BPF_STRUCT_OPS(atropos_running, struct task_struct *p) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.h b/tools/sched_ext/atropos/src/bpf/atropos.h index 3bc71493a027c..64cdafb30bcae 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.h +++ b/tools/sched_ext/atropos/src/bpf/atropos.h @@ -45,6 +45,7 @@ struct task_ctx { unsigned long long runnable_at; unsigned long long runnable_for; bool dispatch_local; + bool is_kworker; }; #endif /* __ATROPOS_H */ diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index c6bf97727c051..d399b764f3c16 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -93,6 +93,12 @@ struct Opts { #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)] kthreads_local: bool, + /// In recent kernels (>=v6.6), the kernel is responsible for balancing + /// kworkers across L3 cache domains. Exclude them from load-balancing + /// to avoid conflicting operations. Greedy executions still apply. + #[clap(short = 'b', long, action = clap::ArgAction::SetTrue)] + balanced_kworkers: bool, + /// Use FIFO scheduling instead of weighted vtime scheduling. #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)] fifo_sched: bool, @@ -155,6 +161,7 @@ struct TaskInfo { pid: i32, dom_mask: u64, migrated: Cell, + is_kworker: bool, } struct LoadBalancer<'a, 'b, 'c> { @@ -162,6 +169,7 @@ struct LoadBalancer<'a, 'b, 'c> { task_loads: &'b mut BTreeMap, nr_doms: usize, load_decay_factor: f64, + skip_kworkers: bool, tasks_by_load: Vec, TaskInfo>>, load_avg: f64, @@ -184,6 +192,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { task_loads: &'b mut BTreeMap, nr_doms: usize, load_decay_factor: f64, + skip_kworkers: bool, nr_lb_data_errors: &'c mut u64, ) -> Self { Self { @@ -191,6 +200,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { task_loads, nr_doms, load_decay_factor, + skip_kworkers, tasks_by_load: (0..nr_doms).map(|_| BTreeMap::<_, _>::new()).collect(), load_avg: 0f64, @@ -279,6 +289,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { pid, dom_mask: task_ctx.dom_mask, migrated: Cell::new(false), + is_kworker: task_ctx.is_kworker, }, ); } @@ -307,13 +318,21 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // Find the first candidate pid which hasn't already been migrated and // can run in @pull_dom. 
- fn find_first_candidate<'d, I>(tasks_by_load: I, pull_dom: u32) -> Option<(f64, &'d TaskInfo)> + fn find_first_candidate<'d, I>( + tasks_by_load: I, + pull_dom: u32, + skip_kworkers: bool, + ) -> Option<(f64, &'d TaskInfo)> where I: IntoIterator, &'d TaskInfo)>, { match tasks_by_load .into_iter() - .skip_while(|(_, task)| task.migrated.get() || task.dom_mask & (1 << pull_dom) == 0) + .skip_while(|(_, task)| { + task.migrated.get() + || (task.dom_mask & (1 << pull_dom) == 0) + || (skip_kworkers && task.is_kworker) + }) .next() { Some((OrderedFloat(load), task)) => Some((*load, task)), @@ -356,11 +375,13 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { .range((Unbounded, Included(&OrderedFloat(to_xfer)))) .rev(), pull_dom, + self.skip_kworkers, ), Self::find_first_candidate( self.tasks_by_load[push_dom as usize] .range((Included(&OrderedFloat(to_xfer)), Unbounded)), pull_dom, + self.skip_kworkers, ), ) { (None, None) => return None, @@ -482,6 +503,7 @@ struct Scheduler<'a> { nr_doms: usize, load_decay_factor: f64, balance_load: bool, + balanced_kworkers: bool, proc_reader: procfs::ProcReader, @@ -703,6 +725,7 @@ impl<'a> Scheduler<'a> { nr_doms, load_decay_factor: opts.load_decay_factor.clamp(0.0, 0.99), balance_load: !opts.no_load_balance, + balanced_kworkers: opts.balanced_kworkers, proc_reader, @@ -862,6 +885,7 @@ impl<'a> Scheduler<'a> { &mut self.task_loads, self.nr_doms, self.load_decay_factor, + self.balanced_kworkers, &mut self.nr_lb_data_errors, ); diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 7b82dd1f29752..712f7f7c1b03c 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -13,6 +13,7 @@ #include #include "user_exit_info.h" +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_EXITING 0x00000004 #define CLOCK_MONOTONIC 1 From 0bfa6bf0e066fe7ba61acb8f5cb38b7aea31fbe3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:55 -1000 Subject: [PATCH 060/304] SCX: atropos: Adjust load balance and greedy params W/ fio on dm-crypt + L3-aware workqueue benchmark, it was observed that LOAD_IMBAL_REDUCTION_MIN_RATIO can often persist significant load imbalance for a very long preiod because it rejects a task transfer if the load imbal reduction from that particular transfer is less than 10%. If there is a large load imbalance which is caused by combination of many smaller loads, they'll never get balanced. However, if we remove LOAD_IMBAL_REDUCTION_MIN_RATIO, the load balancer becomes prone to oscillations because it tries to balance fully on each round. Instead, add LOAD_IMBAL_XFER_TARGET_RATIO which dictates the target transfer amount on each push/pull pair and set it to 50%. ie. Now the load balancer tries transfer candidate search targeting 50% load imbalance reduction. However, the final transfer candidate selection is still done based on the eventual load imbalance reduction so that we can eventually converge on balanced load. After the above changes, we always converge to the balanced state (as defined by LOAD_IMBAL_HIGH_RATIO) but gradually. To ensure work conservation, let's set the default --greedy-threshold to 1. This does have negative performance implications but this will be addressed in the future patches. 
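As a rough worked example of what the new knob changes (all numbers are invented; the real selection logic lives in tools/sched_ext/atropos/src/main.rs):

	#include <stdio.h>

	int main(void)
	{
		double load_avg = 100.0;
		double push_dom_load = 140.0;	/* overloaded domain */
		double pull_dom_load = 60.0;	/* underloaded domain */

		double to_push = push_dom_load - load_avg;	/* 40.0 */
		double to_pull = load_avg - pull_dom_load;	/* 40.0 */
		double xfer_target_ratio = 0.50;		/* LOAD_IMBAL_XFER_TARGET_RATIO */

		/*
		 * The old code searched for a transfer candidate around the full
		 * min(to_push, to_pull) = 40.0 and rejected any transfer that cut
		 * the imbalance by less than 10%. The new code aims the candidate
		 * search at half of that, 20.0, and keeps whichever candidate
		 * doesn't worsen the eventual imbalance, so the domains converge
		 * gradually instead of oscillating.
		 */
		double to_xfer = (to_push < to_pull ? to_push : to_pull) * xfer_target_ratio;

		printf("candidate search target: %.1f load units\n", to_xfer);
		return 0;
	}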
--- tools/sched_ext/atropos/src/main.rs | 33 +++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index d399b764f3c16..61c75868cbf19 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -72,7 +72,7 @@ struct Opts { /// cpu will attempt to steal tasks from a domain with at least /// greedy_threshold tasks enqueued. These tasks aren't permanently /// stolen from the domain. - #[clap(short = 'g', long, default_value = "4")] + #[clap(short = 'g', long, default_value = "1")] greedy_threshold: u32, /// The load decay factor. Every interval, the existing load is decayed @@ -183,8 +183,22 @@ struct LoadBalancer<'a, 'b, 'c> { } impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { + // If imbalance gets higher than this ratio, try to balance the loads. const LOAD_IMBAL_HIGH_RATIO: f64 = 0.10; - const LOAD_IMBAL_REDUCTION_MIN_RATIO: f64 = 0.1; + + // Aim to transfer this fraction of the imbalance on each round. We want + // to be gradual to avoid unnecessary oscillations. While this can delay + // convergence, greedy execution should be able to bridge the temporary + // gap. + const LOAD_IMBAL_XFER_TARGET_RATIO: f64 = 0.50; + + // Don't push out more than this ratio of load on each round. While this + // overlaps with XFER_TARGET_RATIO, XFER_TARGET_RATIO only defines the + // target and doesn't limit the total load. As long as the transfer + // reduces load imbalance between the two involved domains, it'd happily + // transfer whatever amount that can be transferred. This limit is used + // as the safety cap to avoid draining a given domain too much in a + // single round. const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; fn new( @@ -345,7 +359,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { (push_dom, to_push): (u32, f64), (pull_dom, to_pull): (u32, f64), ) -> Option<(&TaskInfo, f64)> { - let to_xfer = to_pull.min(to_push); + let to_xfer = to_pull.min(to_push) * Self::LOAD_IMBAL_XFER_TARGET_RATIO; trace!( "considering dom {}@{:.2} -> {}@{:.2}", @@ -364,11 +378,12 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { ); // We want to pick a task to transfer from push_dom to pull_dom to - // maximize the reduction of load imbalance between the two. IOW, - // pick a task which has the closest load value to $to_xfer that can - // be migrated. Find such task by locating the first migratable task - // while scanning left from $to_xfer and the counterpart while - // scanning right and picking the better of the two. + // reduce the load imbalance between the two closest to $to_xfer. + // IOW, pick a task which has the closest load value to $to_xfer + // that can be migrated. Find such task by locating the first + // migratable task while scanning left from $to_xfer and the + // counterpart while scanning right and picking the better of the + // two. let (load, task, new_imbal) = match ( Self::find_first_candidate( self.tasks_by_load[push_dom as usize] @@ -401,7 +416,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // If the best candidate can't reduce the imbalance, there's nothing // to do for this pair. 
let old_imbal = to_push + to_pull; - if old_imbal * (1.0 - Self::LOAD_IMBAL_REDUCTION_MIN_RATIO) < new_imbal { + if old_imbal < new_imbal { trace!( "skipping pid {}, dom {} -> {} won't improve imbal {:.2} -> {:.2}", task.pid, From c65af256e6c6e1ab08700a7f3f81d6e571187377 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:55 -1000 Subject: [PATCH 061/304] SCX: atropos: Replace fb_procfs with MyProcStat Neither procfs or fb_procfs can determine per-CPU utilization reliably with CPU hot[un]plugs. Roll our own. https://github.com/eminence/procfs/issues/274 https://github.com/facebookincubator/below/issues/8190 --- tools/sched_ext/atropos/Cargo.toml | 1 - tools/sched_ext/atropos/src/main.rs | 141 +++++++++++++++------------- 2 files changed, 78 insertions(+), 64 deletions(-) diff --git a/tools/sched_ext/atropos/Cargo.toml b/tools/sched_ext/atropos/Cargo.toml index 1e47d86fc2cf4..a5ab02cb55f86 100644 --- a/tools/sched_ext/atropos/Cargo.toml +++ b/tools/sched_ext/atropos/Cargo.toml @@ -11,7 +11,6 @@ anyhow = "1.0.65" bitvec = { version = "1.0", features = ["serde"] } clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } -fb_procfs = { git = "https://github.com/facebookincubator/below.git", rev = "f305730"} hex = "0.4.3" libbpf-rs = "0.19.1" libbpf-sys = { version = "1.0.4", features = ["novendor", "static"] } diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 61c75868cbf19..cb690c06e7bf9 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -19,7 +19,6 @@ use std::sync::Arc; use std::time::Duration; use std::time::SystemTime; -use ::fb_procfs as procfs; use anyhow::anyhow; use anyhow::bail; use anyhow::Context; @@ -115,14 +114,6 @@ struct Opts { verbose: u8, } -fn read_total_cpu(reader: &mut procfs::ProcReader) -> Result { - Ok(reader - .read_stat() - .context("Failed to read procfs")? - .total_cpu - .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))?) -} - fn now_monotonic() -> u64 { let mut time = libc::timespec { tv_sec: 0, @@ -150,6 +141,80 @@ fn format_cpumask(cpumask: &[u64], nr_cpus: usize) -> String { .fold(String::new(), |acc, x| format!("{} {:016X}", acc, x)) } +// Neither procfs or fb_procfs can determine per-CPU utilization reliably +// with CPU hot[un]plugs. Roll our own. 
+// +// https://github.com/eminence/procfs/issues/274 +// https://github.com/facebookincubator/below/issues/8190 +#[derive(Clone, Debug, Default)] +struct MyCpuStat { + user: u64, + nice: u64, + system: u64, + idle: u64, + iowait: u64, + irq: u64, + softirq: u64, + steal: u64, +} + +impl MyCpuStat { + fn busy_and_total(&self) -> (u64, u64) { + let busy = self.user + self.system + self.nice + self.irq + self.softirq + self.steal; + (busy, self.idle + busy + self.iowait) + } + + fn calc_util(&self, prev: &MyCpuStat) -> f64 { + let (curr_busy, curr_total) = self.busy_and_total(); + let (prev_busy, prev_total) = prev.busy_and_total(); + let busy = curr_busy - prev_busy; + let total = curr_total - prev_total; + if total > 0 { + ((busy as f64) / (total as f64)).clamp(0.0, 1.0) + } else { + 1.0 + } + } +} + +#[derive(Clone, Debug, Default)] +struct MyProcStat { + total: MyCpuStat, + cpus: BTreeMap, +} + +impl MyProcStat { + fn read() -> Result { + let mut result: MyProcStat = Default::default(); + for line in std::fs::read_to_string("/proc/stat")?.lines() { + let mut toks = line.split_whitespace(); + + let key = toks.next().ok_or(anyhow!("no key"))?; + if !key.starts_with("cpu") { + break; + } + + let cputime = MyCpuStat { + user: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + nice: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + system: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + idle: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + iowait: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + irq: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + softirq: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + steal: toks.next().ok_or(anyhow!("missing"))?.parse::()?, + }; + + if key.len() == 3 { + result.total = cputime; + } else { + result.cpus.insert(key[3..].parse::()?, cputime); + } + } + Ok(result) + } +} + #[derive(Debug)] struct TaskLoad { runnable_for: u64, @@ -520,10 +585,8 @@ struct Scheduler<'a> { balance_load: bool, balanced_kworkers: bool, - proc_reader: procfs::ProcReader, - prev_at: SystemTime, - prev_total_cpu: procfs::CpuStat, + prev_total_cpu: MyCpuStat, task_loads: BTreeMap, nr_lb_data_errors: u64, @@ -729,8 +792,7 @@ impl<'a> Scheduler<'a> { info!("Atropos Scheduler Attached"); // Other stuff. 
- let mut proc_reader = procfs::ProcReader::new(); - let prev_total_cpu = read_total_cpu(&mut proc_reader)?; + let prev_total_cpu = MyProcStat::read()?.total; Ok(Self { skel, @@ -742,8 +804,6 @@ impl<'a> Scheduler<'a> { balance_load: !opts.no_load_balance, balanced_kworkers: opts.balanced_kworkers, - proc_reader, - prev_at: SystemTime::now(), prev_total_cpu, task_loads: BTreeMap::new(), @@ -753,53 +813,8 @@ impl<'a> Scheduler<'a> { } fn get_cpu_busy(&mut self) -> Result { - let total_cpu = read_total_cpu(&mut self.proc_reader)?; - let busy = match (&self.prev_total_cpu, &total_cpu) { - ( - procfs::CpuStat { - user_usec: Some(prev_user), - nice_usec: Some(prev_nice), - system_usec: Some(prev_system), - idle_usec: Some(prev_idle), - iowait_usec: Some(prev_iowait), - irq_usec: Some(prev_irq), - softirq_usec: Some(prev_softirq), - stolen_usec: Some(prev_stolen), - guest_usec: _, - guest_nice_usec: _, - }, - procfs::CpuStat { - user_usec: Some(curr_user), - nice_usec: Some(curr_nice), - system_usec: Some(curr_system), - idle_usec: Some(curr_idle), - iowait_usec: Some(curr_iowait), - irq_usec: Some(curr_irq), - softirq_usec: Some(curr_softirq), - stolen_usec: Some(curr_stolen), - guest_usec: _, - guest_nice_usec: _, - }, - ) => { - let idle_usec = curr_idle - prev_idle; - let iowait_usec = curr_iowait - prev_iowait; - let user_usec = curr_user - prev_user; - let system_usec = curr_system - prev_system; - let nice_usec = curr_nice - prev_nice; - let irq_usec = curr_irq - prev_irq; - let softirq_usec = curr_softirq - prev_softirq; - let stolen_usec = curr_stolen - prev_stolen; - - let busy_usec = - user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; - let total_usec = idle_usec + busy_usec + iowait_usec; - busy_usec as f64 / total_usec as f64 - } - _ => { - bail!("Some procfs stats are not populated!"); - } - }; - + let total_cpu = MyProcStat::read()?.total; + let busy = total_cpu.calc_util(&self.prev_total_cpu); self.prev_total_cpu = total_cpu; Ok(busy) } From 48141b3258c1eaf5f2d0e59f5efc56c0d8c76d05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:55 -1000 Subject: [PATCH 062/304] SCX: atropos: Reorganize Scheduler To enable adding a mechanism which runs at a different frequency than the load balancer: * Refactor scheduling loop into Scheduler::run(). * Refactor topology handling into struct Topology. 
--- tools/sched_ext/atropos/src/main.rs | 371 +++++++++++++++------------- 1 file changed, 195 insertions(+), 176 deletions(-) diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index cb690c06e7bf9..19dd4fe6edfe1 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -215,6 +215,157 @@ impl MyProcStat { } } +#[derive(Debug)] +struct Topology { + nr_cpus: usize, + nr_doms: usize, + dom_cpus: Vec>, + cpu_dom: Vec, +} + +impl Topology { + fn from_cpumasks(cpumasks: &[String], nr_cpus: usize) -> Result { + if cpumasks.len() > atropos_sys::MAX_DOMS as usize { + bail!( + "Number of requested DSQs ({}) is greater than MAX_DOMS ({})", + cpumasks.len(), + atropos_sys::MAX_DOMS + ); + } + let mut cpu_dom = vec![-1i32; nr_cpus]; + let mut dom_cpus = + vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; cpumasks.len()]; + for (dsq, cpumask) in cpumasks.iter().enumerate() { + let hex_str = { + let mut tmp_str = cpumask + .strip_prefix("0x") + .unwrap_or(cpumask) + .replace('_', ""); + if tmp_str.len() % 2 != 0 { + tmp_str = "0".to_string() + &tmp_str; + } + tmp_str + }; + let byte_vec = hex::decode(&hex_str) + .with_context(|| format!("Failed to parse cpumask: {}", cpumask))?; + + for (index, &val) in byte_vec.iter().rev().enumerate() { + let mut v = val; + while v != 0 { + let lsb = v.trailing_zeros() as usize; + v &= !(1 << lsb); + let cpu = index * 8 + lsb; + if cpu > nr_cpus { + bail!( + concat!( + "Found cpu ({}) in cpumask ({}) which is larger", + " than the number of cpus on the machine ({})" + ), + cpu, + cpumask, + nr_cpus + ); + } + if cpu_dom[cpu] != -1 { + bail!( + "Found cpu ({}) with dsq ({}) but also in cpumask ({})", + cpu, + cpu_dom[cpu], + cpumask + ); + } + cpu_dom[cpu] = dsq as i32; + dom_cpus[dsq].set(cpu, true); + } + } + dom_cpus[dsq].set_uninitialized(false); + } + + for (cpu, &dsq) in cpu_dom.iter().enumerate() { + if dsq < 0 { + bail!( + "Cpu {} not assigned to any dsq. Make sure it is covered by some --cpumasks argument.", + cpu + ); + } + } + + Ok(Self { + nr_cpus, + nr_doms: dom_cpus.len(), + dom_cpus, + cpu_dom, + }) + } + + fn from_cache_level(level: u32, nr_cpus: usize) -> Result { + let mut cpu_to_cache = vec![]; // (cpu_id, cache_id) + let mut cache_ids = BTreeSet::::new(); + let mut nr_not_found = 0; + + // Build cpu -> cache ID mapping. + for cpu in 0..nr_cpus { + let path = format!("/sys/devices/system/cpu/cpu{}/cache/index{}/id", cpu, level); + let id = match std::fs::read_to_string(&path) { + Ok(val) => val + .trim() + .parse::() + .with_context(|| format!("Failed to parse {:?}'s content {:?}", &path, &val))?, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + nr_not_found += 1; + 0 + } + Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), + }; + + cpu_to_cache.push(id); + cache_ids.insert(id); + } + + if nr_not_found > 1 { + warn!( + "Couldn't determine level {} cache IDs for {} CPUs out of {}, assigned to cache ID 0", + level, nr_not_found, nr_cpus + ); + } + + // Cache IDs may have holes. Assign consecutive domain IDs to + // existing cache IDs. + let mut cache_to_dom = BTreeMap::::new(); + let mut nr_doms = 0; + for cache_id in cache_ids.iter() { + cache_to_dom.insert(*cache_id, nr_doms); + nr_doms += 1; + } + + if nr_doms > atropos_sys::MAX_DOMS { + bail!( + "Total number of doms {} is greater than MAX_DOMS ({})", + nr_doms, + atropos_sys::MAX_DOMS + ); + } + + // Build and return dom -> cpumask and cpu -> dom mappings. 
+ let mut dom_cpus = + vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; nr_doms as usize]; + let mut cpu_dom = vec![]; + + for cpu in 0..nr_cpus { + let dom_id = cache_to_dom[&cpu_to_cache[cpu]]; + dom_cpus[dom_id as usize].set(cpu, true); + cpu_dom.push(dom_id as i32); + } + + Ok(Self { + nr_cpus, + nr_doms: dom_cpus.len(), + dom_cpus, + cpu_dom, + }) + } +} + #[derive(Debug)] struct TaskLoad { runnable_for: u64, @@ -231,8 +382,8 @@ struct TaskInfo { struct LoadBalancer<'a, 'b, 'c> { maps: AtroposMapsMut<'a>, + top: Arc, task_loads: &'b mut BTreeMap, - nr_doms: usize, load_decay_factor: f64, skip_kworkers: bool, @@ -268,8 +419,8 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { fn new( maps: AtroposMapsMut<'a>, + top: Arc, task_loads: &'b mut BTreeMap, - nr_doms: usize, load_decay_factor: f64, skip_kworkers: bool, nr_lb_data_errors: &'c mut u64, @@ -277,19 +428,20 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { Self { maps, task_loads, - nr_doms, load_decay_factor, skip_kworkers, - tasks_by_load: (0..nr_doms).map(|_| BTreeMap::<_, _>::new()).collect(), + tasks_by_load: (0..top.nr_doms).map(|_| BTreeMap::<_, _>::new()).collect(), load_avg: 0f64, - dom_loads: vec![0.0; nr_doms], + dom_loads: vec![0.0; top.nr_doms], - imbal: vec![0.0; nr_doms], + imbal: vec![0.0; top.nr_doms], doms_to_pull: BTreeMap::new(), doms_to_push: BTreeMap::new(), nr_lb_data_errors, + + top, } } @@ -298,7 +450,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { let task_data = self.maps.task_data(); let mut this_task_loads = BTreeMap::::new(); let mut load_sum = 0.0f64; - self.dom_loads = vec![0f64; self.nr_doms]; + self.dom_loads = vec![0f64; self.top.nr_doms]; for key in task_data.keys() { if let Some(task_ctx_vec) = task_data @@ -374,7 +526,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { } } - self.load_avg = load_sum / self.nr_doms as f64; + self.load_avg = load_sum / self.top.nr_doms as f64; *self.task_loads = this_task_loads; Ok(()) } @@ -579,12 +731,13 @@ struct Scheduler<'a> { skel: AtroposSkel<'a>, struct_ops: Option, - nr_cpus: usize, - nr_doms: usize, + interval: Duration, load_decay_factor: f64, balance_load: bool, balanced_kworkers: bool, + top: Arc, + prev_at: SystemTime, prev_total_cpu: MyCpuStat, task_loads: BTreeMap, @@ -593,145 +746,6 @@ struct Scheduler<'a> { } impl<'a> Scheduler<'a> { - // Returns Vec of cpuset for each dsq and a vec of dsq for each cpu - fn parse_cpusets( - cpumasks: &[String], - nr_cpus: usize, - ) -> Result<(Vec>, Vec)> { - if cpumasks.len() > atropos_sys::MAX_DOMS as usize { - bail!( - "Number of requested DSQs ({}) is greater than MAX_DOMS ({})", - cpumasks.len(), - atropos_sys::MAX_DOMS - ); - } - let mut cpus = vec![-1i32; nr_cpus]; - let mut cpusets = - vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; cpumasks.len()]; - for (dsq, cpumask) in cpumasks.iter().enumerate() { - let hex_str = { - let mut tmp_str = cpumask - .strip_prefix("0x") - .unwrap_or(cpumask) - .replace('_', ""); - if tmp_str.len() % 2 != 0 { - tmp_str = "0".to_string() + &tmp_str; - } - tmp_str - }; - let byte_vec = hex::decode(&hex_str) - .with_context(|| format!("Failed to parse cpumask: {}", cpumask))?; - - for (index, &val) in byte_vec.iter().rev().enumerate() { - let mut v = val; - while v != 0 { - let lsb = v.trailing_zeros() as usize; - v &= !(1 << lsb); - let cpu = index * 8 + lsb; - if cpu > nr_cpus { - bail!( - concat!( - "Found cpu ({}) in cpumask ({}) which is larger", - " than the number of cpus on the machine ({})" - ), - cpu, - cpumask, - nr_cpus - ); - } - if cpus[cpu] != -1 
{ - bail!( - "Found cpu ({}) with dsq ({}) but also in cpumask ({})", - cpu, - cpus[cpu], - cpumask - ); - } - cpus[cpu] = dsq as i32; - cpusets[dsq].set(cpu, true); - } - } - cpusets[dsq].set_uninitialized(false); - } - - for (cpu, &dsq) in cpus.iter().enumerate() { - if dsq < 0 { - bail!( - "Cpu {} not assigned to any dsq. Make sure it is covered by some --cpumasks argument.", - cpu - ); - } - } - - Ok((cpusets, cpus)) - } - - // Returns Vec of cpuset for each dsq and a vec of dsq for each cpu - fn cpusets_from_cache( - level: u32, - nr_cpus: usize, - ) -> Result<(Vec>, Vec)> { - let mut cpu_to_cache = vec![]; // (cpu_id, cache_id) - let mut cache_ids = BTreeSet::::new(); - let mut nr_not_found = 0; - - // Build cpu -> cache ID mapping. - for cpu in 0..nr_cpus { - let path = format!("/sys/devices/system/cpu/cpu{}/cache/index{}/id", cpu, level); - let id = match std::fs::read_to_string(&path) { - Ok(val) => val - .trim() - .parse::() - .with_context(|| format!("Failed to parse {:?}'s content {:?}", &path, &val))?, - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - nr_not_found += 1; - 0 - } - Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), - }; - - cpu_to_cache.push(id); - cache_ids.insert(id); - } - - if nr_not_found > 1 { - warn!( - "Couldn't determine level {} cache IDs for {} CPUs out of {}, assigned to cache ID 0", - level, nr_not_found, nr_cpus - ); - } - - // Cache IDs may have holes. Assign consecutive domain IDs to - // existing cache IDs. - let mut cache_to_dom = BTreeMap::::new(); - let mut nr_doms = 0; - for cache_id in cache_ids.iter() { - cache_to_dom.insert(*cache_id, nr_doms); - nr_doms += 1; - } - - if nr_doms > atropos_sys::MAX_DOMS { - bail!( - "Total number of doms {} is greater than MAX_DOMS ({})", - nr_doms, - atropos_sys::MAX_DOMS - ); - } - - // Build and return dom -> cpumask and cpu -> dom mappings. - let mut cpusets = - vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; nr_doms as usize]; - let mut cpu_to_dom = vec![]; - - for cpu in 0..nr_cpus { - let dom_id = cache_to_dom[&cpu_to_cache[cpu]]; - cpusets[dom_id as usize].set(cpu, true); - cpu_to_dom.push(dom_id as i32); - } - - Ok((cpusets, cpu_to_dom)) - } - fn init(opts: &Opts) -> Result { // Open the BPF prog first for verification. let mut skel_builder = AtroposSkelBuilder::default(); @@ -748,29 +762,29 @@ impl<'a> Scheduler<'a> { } // Initialize skel according to @opts. - let (cpusets, cpus) = if opts.cpumasks.len() > 0 { - Self::parse_cpusets(&opts.cpumasks, nr_cpus)? + let top = Arc::new(if opts.cpumasks.len() > 0 { + Topology::from_cpumasks(&opts.cpumasks, nr_cpus)? } else { - Self::cpusets_from_cache(opts.cache_level, nr_cpus)? - }; - let nr_doms = cpusets.len(); - skel.rodata().nr_doms = nr_doms as u32; - skel.rodata().nr_cpus = nr_cpus as u32; + Topology::from_cache_level(opts.cache_level, nr_cpus)? 
+ }); + + skel.rodata().nr_doms = top.nr_doms as u32; + skel.rodata().nr_cpus = top.nr_cpus as u32; - for (cpu, dom) in cpus.iter().enumerate() { + for (cpu, dom) in top.cpu_dom.iter().enumerate() { skel.rodata().cpu_dom_id_map[cpu] = *dom as u32; } - for (dom, cpuset) in cpusets.iter().enumerate() { - let raw_cpuset_slice = cpuset.as_raw_slice(); + for (dom, cpus) in top.dom_cpus.iter().enumerate() { + let raw_cpus_slice = cpus.as_raw_slice(); let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; - let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpuset_slice.len()); - left.clone_from_slice(cpuset.as_raw_slice()); + let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpus_slice.len()); + left.clone_from_slice(cpus.as_raw_slice()); info!( "DOM[{:02}] cpumask{} ({} cpus)", dom, &format_cpumask(dom_cpumask_slice, nr_cpus), - cpuset.count_ones() + cpus.count_ones() ); } @@ -798,12 +812,13 @@ impl<'a> Scheduler<'a> { skel, struct_ops, // should be held to keep it attached - nr_cpus, - nr_doms, + interval: Duration::from_secs_f64(opts.interval), load_decay_factor: opts.load_decay_factor.clamp(0.0, 0.99), balance_load: !opts.no_load_balance, balanced_kworkers: opts.balanced_kworkers, + top, + prev_at: SystemTime::now(), prev_total_cpu, task_loads: BTreeMap::new(), @@ -823,7 +838,7 @@ impl<'a> Scheduler<'a> { let mut maps = self.skel.maps_mut(); let stats_map = maps.stats(); let mut stats: Vec = Vec::new(); - let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.nr_cpus]; + let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.top.nr_cpus]; for stat in 0..atropos_sys::stat_idx_ATROPOS_NR_STATS { let cpu_stat_vec = stats_map @@ -897,7 +912,7 @@ impl<'a> Scheduler<'a> { stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_REPATRIATE), ); - for i in 0..self.nr_doms { + for i in 0..self.top.nr_doms { info!( "DOM[{:02}] load={:8.2} imbal={:+9.2}", i, dom_loads[i], imbal[i], @@ -905,15 +920,15 @@ impl<'a> Scheduler<'a> { } } - fn step(&mut self) -> Result<()> { + fn lb_step(&mut self) -> Result<()> { let started_at = std::time::SystemTime::now(); let bpf_stats = self.read_bpf_stats()?; let cpu_busy = self.get_cpu_busy()?; let mut lb = LoadBalancer::new( self.skel.maps_mut(), + self.top.clone(), &mut self.task_loads, - self.nr_doms, self.load_decay_factor, self.balanced_kworkers, &mut self.nr_lb_data_errors, @@ -965,6 +980,15 @@ impl<'a> Scheduler<'a> { } } } + + fn run(&mut self, shutdown: Arc) -> Result<()> { + while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_type() == 0 { + std::thread::sleep(self.interval); + self.lb_step()?; + } + + self.report_bpf_exit_type() + } } impl<'a> Drop for Scheduler<'a> { @@ -995,6 +1019,8 @@ fn main() -> Result<()> { simplelog::ColorChoice::Auto, )?; + let mut sched = Scheduler::init(&opts)?; + let shutdown = Arc::new(AtomicBool::new(false)); let shutdown_clone = shutdown.clone(); ctrlc::set_handler(move || { @@ -1002,12 +1028,5 @@ fn main() -> Result<()> { }) .context("Error setting Ctrl-C handler")?; - let mut sched = Scheduler::init(&opts)?; - - while !shutdown.load(Ordering::Relaxed) && sched.read_bpf_exit_type() == 0 { - std::thread::sleep(Duration::from_secs_f64(opts.interval)); - sched.step()?; - } - - sched.report_bpf_exit_type() + sched.run(shutdown) } From 70b2e43062bc1ffc24028d09efc3720e72355986 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:55 -1000 Subject: [PATCH 063/304] SCX: atropos: Improve target CPU selection Target CPU selection in select_cpu() is more challenging for atropos as there 
are tradeoffs between keeping a task local and trying to exploit a less occupied CPU in a foreign domain. atropos didn't consider CPU utilizations when making target CPU selections which limited its effectiveness. This patch improves the situation by: * Implement a new high-level userland subsystem called Tuner which runs more frequently than LoadBalancer (100ms by default) and updates the parameters in struct tune_input to modify the BPF scheduler's behavior. Currently, Tuner only consider CPU utilizatoin and tells the BPF scheduler whether a given CPU is eligible for the new DIRECT_GREEDY and KICK_GREEDY optimizations. * For domestic CPUs, select_cpu() is updated to use the new SCX_PICK_IDLE_CPU_WHOLE to prefer wholly-idle CPUs over partially idle @prev_cpu. * If @prev_cpu was in a foreign domain which has enough unused CPU cycles and @prev_cpu is wholly idle (all its siblings are idle too), keep @prev_cpu by putting the task directly into @prev_cpu's DSQ which is counted as DIRECT_GREEDY. Keeping @prev_cpu only when it's wholly idle removes the negative performance impact of this mechanism observed in fio on kcryptd benchmark. * Implement similar pattern of preferring wholly idle CPUs and gating foreign DIRECT_GREEDY according to the domain's CPU utilization down the CPU preference ladder. * If no direct dispatch path could be taken and if there are any idle CPUs in the system, domestic or foreign, kick it after enqueueing to the domain. This doesn't seem to have negative performance impact while improving work conservation in some cases. With these optimizations, atropos matches or outperforms CFS at various load levels when running fio on top of a dm-crypt device. --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 253 +++++++++++++++++- tools/sched_ext/atropos/src/bpf/atropos.h | 17 +- tools/sched_ext/atropos/src/main.rs | 174 +++++++++++- 3 files changed, 419 insertions(+), 25 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 57fc885cd280a..92144786cd9e2 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -91,6 +91,7 @@ struct pcpu_ctx pcpu_ctx[MAX_CPUS]; */ struct dom_ctx { struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *direct_greedy_cpumask; u64 vtime_now; }; @@ -142,6 +143,21 @@ struct { __uint(map_flags, 0); } lb_data SEC(".maps"); +/* + * Userspace tuner will frequently update the following struct with tuning + * parameters and bump its gen. refresh_tune_params() converts them into forms + * that can be used directly in the scheduling paths. 
+ */ +struct tune_input{ + __u64 gen; + __u64 direct_greedy_cpumask[MAX_CPUS / 64]; + __u64 kick_greedy_cpumask[MAX_CPUS / 64]; +} tune_input; + +__u64 tune_params_gen; +private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask; +private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask; + static inline bool vtime_before(u64 a, u64 b) { return (s64)(a - b) < 0; @@ -161,6 +177,46 @@ static u32 cpu_to_dom_id(s32 cpu) return *dom_idp; } +static void refresh_tune_params(void) +{ + s32 cpu; + + if (tune_params_gen == tune_input.gen) + return; + + tune_params_gen = tune_input.gen; + + bpf_for(cpu, 0, nr_cpus) { + u32 dom_id = cpu_to_dom_id(cpu); + struct dom_ctx *domc; + + if (!(domc = bpf_map_lookup_elem(&dom_ctx, &dom_id))) { + scx_bpf_error("Failed to lookup dom[%u]", dom_id); + return; + } + + if (tune_input.direct_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { + if (direct_greedy_cpumask) + bpf_cpumask_set_cpu(cpu, direct_greedy_cpumask); + if (domc->direct_greedy_cpumask) + bpf_cpumask_set_cpu(cpu, domc->direct_greedy_cpumask); + } else { + if (direct_greedy_cpumask) + bpf_cpumask_clear_cpu(cpu, direct_greedy_cpumask); + if (domc->direct_greedy_cpumask) + bpf_cpumask_clear_cpu(cpu, domc->direct_greedy_cpumask); + } + + if (tune_input.kick_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { + if (kick_greedy_cpumask) + bpf_cpumask_set_cpu(cpu, kick_greedy_cpumask); + } else { + if (kick_greedy_cpumask) + bpf_cpumask_clear_cpu(cpu, kick_greedy_cpumask); + } + } +} + static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, u32 new_dom_id, bool init_dsq_vtime) { @@ -214,14 +270,18 @@ static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, return task_ctx->dom_id == new_dom_id; } -s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, - u32 wake_flags) +s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) { struct task_ctx *task_ctx; + struct cpumask *idle_smtmask; struct bpf_cpumask *p_cpumask; pid_t pid = p->pid; + bool prev_domestic, has_idle_wholes; s32 cpu; + refresh_tune_params(); + if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid)) || !(p_cpumask = task_ctx->cpumask)) return -ENOENT; @@ -277,21 +337,69 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, } } - /* if the previous CPU is idle, dispatch directly to it */ - if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { - stat_add(ATROPOS_STAT_PREV_IDLE, 1); + /* If only one CPU is allowed, dispatch */ + if (p->nr_cpus_allowed == 1) { + stat_add(ATROPOS_STAT_PINNED, 1); cpu = prev_cpu; goto direct; } - /* If only one core is allowed, dispatch */ - if (p->nr_cpus_allowed == 1) { - stat_add(ATROPOS_STAT_PINNED, 1); + idle_smtmask = scx_bpf_get_idle_smtmask(); + has_idle_wholes = !bpf_cpumask_empty(idle_smtmask); + + /* did @p get pulled out to a foreign domain by e.g. greedy execution? */ + prev_domestic = bpf_cpumask_test_cpu(prev_cpu, + (const struct cpumask *)p_cpumask); + + /* + * See if we want to keep @prev_cpu. We want to keep @prev_cpu if the + * whole physical core is idle. If the sibling[s] are busy, it's likely + * more advantageous to look for wholly idle cores first. 
+ */ + if (prev_domestic) { + if (bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + stat_add(ATROPOS_STAT_PREV_IDLE, 1); + cpu = prev_cpu; + goto direct; + } + } else { + if (direct_greedy_cpumask && + bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *) + direct_greedy_cpumask) && + bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + stat_add(ATROPOS_STAT_GREEDY_IDLE, 1); + cpu = prev_cpu; + goto direct; + } + } + + /* + * @prev_cpu didn't work out. Find the best idle domestic CPU. + */ + + /* If there is a domestic whole idle CPU, dispatch directly */ + if (has_idle_wholes) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, + SCX_PICK_IDLE_CPU_WHOLE); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + goto direct; + } + } + + /* + * If @prev_cpu was domestic and is idle itself even though the whole + * core isn't, picking @prev_cpu may improve L1/2 locality. + */ + if (prev_domestic && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); cpu = prev_cpu; goto direct; } - /* If there is an eligible idle CPU, dispatch directly */ + /* If there is any domestic idle CPU, dispatch directly */ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) { stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); @@ -299,19 +407,86 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, } /* - * @prev_cpu may be in a different domain. Returning an out-of-domain - * CPU can lead to stalls as all in-domain CPUs may be idle by the time - * @p gets enqueued. + * Domestic domain is fully booked. If there are CPUs which are idle and + * under-utilized, ignore domain boundaries and push the task there. Try + * to find a whole idle CPU first. + */ + if (task_ctx->all_cpus && direct_greedy_cpumask && + !bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) { + u32 dom_id = cpu_to_dom_id(prev_cpu); + struct dom_ctx *domc; + + if (!(domc = bpf_map_lookup_elem(&dom_ctx, &dom_id))) { + scx_bpf_error("Failed to lookup dom[%u]", dom_id); + scx_bpf_put_idle_cpumask(idle_smtmask); + return -ENOENT; + } + + /* + * Try to find a whole idle CPU in the previous foreign and then + * any domain. + */ + if (has_idle_wholes) { + if (domc->direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + domc->direct_greedy_cpumask, + SCX_PICK_IDLE_CPU_WHOLE); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_DIRECT_GREEDY, 1); + goto direct; + } + } + + if (direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + direct_greedy_cpumask, + SCX_PICK_IDLE_CPU_WHOLE); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_DIRECT_GREEDY_FAR, 1); + goto direct; + } + } + } + + /* + * No whole idle CPU. Is there any idle CPU? + */ + if (domc->direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + domc->direct_greedy_cpumask, 0); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_DIRECT_GREEDY, 1); + goto direct; + } + } + + if (direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + direct_greedy_cpumask, 0); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_DIRECT_GREEDY_FAR, 1); + goto direct; + } + } + } + + /* + * We're going to queue on the domestic domain's DSQ. @prev_cpu may be + * in a different domain. Returning an out-of-domain CPU can lead to + * stalls as all in-domain CPUs may be idle by the time @p gets + * enqueued. 
*/ if (bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *)p_cpumask)) cpu = prev_cpu; else cpu = bpf_cpumask_any((const struct cpumask *)p_cpumask); + scx_bpf_put_idle_cpumask(idle_smtmask); return cpu; direct: task_ctx->dispatch_local = true; + scx_bpf_put_idle_cpumask(idle_smtmask); return cpu; } @@ -400,6 +575,30 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) scx_bpf_dispatch_vtime(p, task_ctx->dom_id, slice_ns, vtime, enq_flags); } + + /* + * If there are CPUs which are idle and not saturated, wake them up to + * see whether they'd be able to steal the just queued task. This path + * is taken only if DIRECT_GREEDY didn't trigger in select_cpu(). + * + * While both mechanisms serve very similar purposes, DIRECT_GREEDY + * emplaces the task in a foreign CPU directly while KICK_GREEDY just + * wakes up a foreign CPU which will then first try to execute from its + * domestic domain first before snooping foreign ones. + * + * While KICK_GREEDY is a more expensive way of accelerating greedy + * execution, DIRECT_GREEDY shows negative performance impacts when the + * CPUs are highly loaded while KICK_GREEDY doesn't. Even under fairly + * high utilization, KICK_GREEDY can slightly improve work-conservation. + */ + if (task_ctx->all_cpus && kick_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + kick_greedy_cpumask, 0); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_KICK_GREEDY, 1); + scx_bpf_kick_cpu(cpu, 0); + } + } } static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id) @@ -593,6 +792,7 @@ void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, } task_pick_and_set_domain(task_ctx, p, cpumask, false); + task_ctx->all_cpus = bpf_cpumask_full(cpumask); } s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, @@ -703,6 +903,21 @@ static s32 create_dom(u32 dom_id) return -EEXIST; } + cpumask = bpf_cpumask_create(); + if (!cpumask) { + scx_bpf_error("Failed to create BPF cpumask for domain %u", + dom_id); + return -ENOMEM; + } + + cpumask = bpf_kptr_xchg(&domc->direct_greedy_cpumask, cpumask); + if (cpumask) { + scx_bpf_error("Domain %u direct_greedy_cpumask already present", + dom_id); + bpf_cpumask_release(cpumask); + return -EEXIST; + } + return 0; } @@ -723,6 +938,20 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(atropos_init) for (u32 i = 0; i < nr_cpus; i++) pcpu_ctx[i].dom_rr_cur = i; + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&direct_greedy_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&kick_greedy_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + return 0; } diff --git a/tools/sched_ext/atropos/src/bpf/atropos.h b/tools/sched_ext/atropos/src/bpf/atropos.h index 64cdafb30bcae..894782e32fa1e 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.h +++ b/tools/sched_ext/atropos/src/bpf/atropos.h @@ -22,13 +22,17 @@ enum stat_idx { /* The following fields add up to all dispatched tasks */ ATROPOS_STAT_WAKE_SYNC, ATROPOS_STAT_PREV_IDLE, + ATROPOS_STAT_GREEDY_IDLE, ATROPOS_STAT_PINNED, ATROPOS_STAT_DIRECT_DISPATCH, + ATROPOS_STAT_DIRECT_GREEDY, + ATROPOS_STAT_DIRECT_GREEDY_FAR, ATROPOS_STAT_DSQ_DISPATCH, ATROPOS_STAT_GREEDY, /* Extra stats that don't contribute to total */ ATROPOS_STAT_REPATRIATE, + ATROPOS_STAT_KICK_GREEDY, ATROPOS_STAT_LOAD_BALANCE, /* Errors */ @@ -38,14 +42,23 @@ enum stat_idx { }; struct task_ctx { - unsigned 
long long dom_mask; /* the domains this task can run on */ + /* The domains this task can run on */ + unsigned long long dom_mask; + struct bpf_cpumask __kptr *cpumask; unsigned int dom_id; unsigned int weight; unsigned long long runnable_at; unsigned long long runnable_for; - bool dispatch_local; + + /* The task is a workqueue worker thread */ bool is_kworker; + + /* Allowed on all CPUs and eligible for DIRECT_GREEDY optimization */ + bool all_cpus; + + /* select_cpu() telling enqueue() to queue directly on the DSQ */ + bool dispatch_local; }; #endif /* __ATROPOS_H */ diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 19dd4fe6edfe1..4890f405e387c 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -54,6 +54,11 @@ struct Opts { #[clap(short = 'i', long, default_value = "2.0")] interval: f64, + /// Tuner runs at higher frequency than the load balancer to dynamically + /// tune scheduling behavior. Tuning interval in seconds. + #[clap(short = 'I', long, default_value = "0.1")] + tune_interval: f64, + /// Build domains according to how CPUs are grouped at this cache level /// as determined by /sys/devices/system/cpu/cpuX/cache/indexI/id. #[clap(short = 'c', long, default_value = "3")] @@ -102,6 +107,17 @@ struct Opts { #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)] fifo_sched: bool, + /// Idle CPUs with utilization lower than this will get remote tasks + /// directly pushed on them. 0 disables, 100 enables always. + #[clap(short = 'D', long, default_value = "90.0")] + direct_greedy_under: f64, + + /// Idle CPUs with utilization lower than this may get kicked to + /// accelerate stealing when a task is queued on a saturated remote + /// domain. 0 disables, 100 enables always. + #[clap(short = 'K', long, default_value = "100.0")] + kick_greedy_under: f64, + /// If specified, only tasks which have their scheduling policy set to /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all /// tasks are switched. @@ -366,6 +382,83 @@ impl Topology { } } +struct Tuner { + top: Arc, + direct_greedy_under: f64, + kick_greedy_under: f64, + prev_cpu_stats: BTreeMap, + dom_utils: Vec, +} + +impl Tuner { + fn new(top: Arc, opts: &Opts) -> Result { + Ok(Self { + direct_greedy_under: opts.direct_greedy_under / 100.0, + kick_greedy_under: opts.kick_greedy_under / 100.0, + prev_cpu_stats: MyProcStat::read()?.cpus, + dom_utils: vec![0.0; top.nr_doms], + top, + }) + } + + fn step(&mut self, skel: &mut AtroposSkel) -> Result<()> { + let curr_cpu_stats = MyProcStat::read()?.cpus; + let ti = &mut skel.bss().tune_input; + let mut dom_nr_cpus = vec![0; self.top.nr_doms]; + let mut dom_util_sum = vec![0.0; self.top.nr_doms]; + + for cpu in 0..self.top.nr_cpus { + let dom = self.top.cpu_dom[cpu] as usize; + + // None MyCpuStat indicates offline CPU. Ignore. + if let (Some(curr), Some(prev)) = + (curr_cpu_stats.get(&cpu), self.prev_cpu_stats.get(&cpu)) + { + dom_nr_cpus[dom] += 1; + dom_util_sum[dom] += curr.calc_util(prev); + } + } + + for dom in 0..self.top.nr_doms { + // Calculate the domain avg util. If there are no active CPUs, + // it doesn't really matter. Go with 0.0 as that's less likely + // to confuse users. + let util = match dom_nr_cpus[dom] { + 0 => 0.0, + nr => dom_util_sum[dom] / nr as f64, + }; + + self.dom_utils[dom] = util; + + // This could be implemented better. 
+ let update_dom_bits = |target: &mut [u64; 8], val: bool| { + for cpu in 0..self.top.nr_cpus { + if self.top.cpu_dom[cpu] as usize == dom { + if val { + target[cpu / 64] |= 1u64 << (cpu % 64); + } else { + target[cpu / 64] &= !(1u64 << (cpu % 64)); + } + } + } + }; + + update_dom_bits( + &mut ti.direct_greedy_cpumask, + self.direct_greedy_under > 0.99999 || util < self.direct_greedy_under, + ); + update_dom_bits( + &mut ti.kick_greedy_cpumask, + self.kick_greedy_under > 0.99999 || util < self.kick_greedy_under, + ); + } + + ti.gen += 1; + self.prev_cpu_stats = curr_cpu_stats; + Ok(()) + } +} + #[derive(Debug)] struct TaskLoad { runnable_for: u64, @@ -441,7 +534,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { nr_lb_data_errors, - top, + top, } } @@ -731,7 +824,8 @@ struct Scheduler<'a> { skel: AtroposSkel<'a>, struct_ops: Option, - interval: Duration, + sched_interval: Duration, + tune_interval: Duration, load_decay_factor: f64, balance_load: bool, balanced_kworkers: bool, @@ -743,6 +837,8 @@ struct Scheduler<'a> { task_loads: BTreeMap, nr_lb_data_errors: u64, + + tuner: Tuner, } impl<'a> Scheduler<'a> { @@ -812,18 +908,21 @@ impl<'a> Scheduler<'a> { skel, struct_ops, // should be held to keep it attached - interval: Duration::from_secs_f64(opts.interval), + sched_interval: Duration::from_secs_f64(opts.interval), + tune_interval: Duration::from_secs_f64(opts.tune_interval), load_decay_factor: opts.load_decay_factor.clamp(0.0, 0.99), balance_load: !opts.no_load_balance, balanced_kworkers: opts.balanced_kworkers, - top, + top: top.clone(), prev_at: SystemTime::now(), prev_total_cpu, task_loads: BTreeMap::new(), nr_lb_data_errors: 0, + + tuner: Tuner::new(top, opts)?, }) } @@ -868,7 +967,7 @@ impl<'a> Scheduler<'a> { } fn report( - &self, + &mut self, stats: &Vec, cpu_busy: f64, processing_dur: Duration, @@ -879,8 +978,11 @@ impl<'a> Scheduler<'a> { let stat = |idx| stats[idx as usize]; let total = stat(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC) + stat(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY_IDLE) + stat(atropos_sys::stat_idx_ATROPOS_STAT_PINNED) + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY) + + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY_FAR) + stat(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH) + stat(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY); @@ -897,25 +999,50 @@ impl<'a> Scheduler<'a> { let stat_pct = |idx| stat(idx) as f64 / total as f64 * 100.0; info!( - "tot={:7} wsync={:5.2} prev_idle={:5.2} pin={:5.2} dir={:5.2}", + "tot={:7} wsync={:5.2} prev_idle={:5.2} greedy_idle={:5.2} pin={:5.2}", total, stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY_IDLE), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PINNED), + ); + + info!( + "dir={:5.2} dir_greedy={:5.2} dir_greedy_far={:5.2}", stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY_FAR), ); info!( - "dsq={:5.2} greedy={:5.2} rep={:5.2}", + "dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}", stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY), + stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_KICK_GREEDY), stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_REPATRIATE), ); + let ti = 
&self.skel.bss().tune_input; + info!( + "direct_greedy_cpumask={}", + format_cpumask(&ti.direct_greedy_cpumask, self.top.nr_cpus) + ); + info!( + " kick_greedy_cpumask={}", + format_cpumask(&ti.kick_greedy_cpumask, self.top.nr_cpus) + ); + for i in 0..self.top.nr_doms { info!( - "DOM[{:02}] load={:8.2} imbal={:+9.2}", - i, dom_loads[i], imbal[i], + "DOM[{:02}] util={:6.2} load={:8.2} imbal={}", + i, + self.tuner.dom_utils[i] * 100.0, + dom_loads[i], + if imbal[i] == 0.0 { + format!("{:9.2}", 0.0) + } else { + format!("{:+9.2}", imbal[i]) + }, ); } } @@ -982,9 +1109,34 @@ impl<'a> Scheduler<'a> { } fn run(&mut self, shutdown: Arc) -> Result<()> { + let now = std::time::SystemTime::now(); + let mut next_tune_at = now + self.tune_interval; + let mut next_sched_at = now + self.sched_interval; + while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_type() == 0 { - std::thread::sleep(self.interval); - self.lb_step()?; + let now = std::time::SystemTime::now(); + + if now >= next_tune_at { + self.tuner.step(&mut self.skel)?; + next_tune_at += self.tune_interval; + if next_tune_at < now { + next_tune_at = now + self.tune_interval; + } + } + + if now >= next_sched_at { + self.lb_step()?; + next_sched_at += self.sched_interval; + if next_sched_at < now { + next_sched_at = now + self.sched_interval; + } + } + + std::thread::sleep( + next_sched_at + .min(next_tune_at) + .duration_since(std::time::SystemTime::now())?, + ); } self.report_bpf_exit_type() From 4263957abe7fdb96ecc823b887bee17dc85402e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Jun 2023 16:03:55 -1000 Subject: [PATCH 064/304] SCX: atropos: Clean up offline CPU handling If a CPU's cache information can't be determined, atropos falls back to DOM0 after generating a warning. This can be jarring and confusing on many modern AMD processors which have some cores disabled and report them as possible but present. Update the rust side so that it uses Option::None in Topology::cpu_dom to clearly distinguish the CPUs that are on DOM0 because cache information couldn't be determined. From the BPF side, this doesn't make any difference; however, the userland side now can differentiate those CPUs and handle them appropriate (e.g. not setting them in direct/kick_greedy_cpumask). As userland can now detect CPUs that have come up after atropos is initialized and thus doesn't have a proper domain assigned, it now should be able to possible to reload itself to reflect the new hardware configuration. This is left for the future for now. 
--- tools/sched_ext/atropos/src/main.rs | 108 +++++++++++++++------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 4890f405e387c..6755bd229a2b9 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -236,22 +236,22 @@ struct Topology { nr_cpus: usize, nr_doms: usize, dom_cpus: Vec>, - cpu_dom: Vec, + cpu_dom: Vec>, } impl Topology { fn from_cpumasks(cpumasks: &[String], nr_cpus: usize) -> Result { if cpumasks.len() > atropos_sys::MAX_DOMS as usize { bail!( - "Number of requested DSQs ({}) is greater than MAX_DOMS ({})", + "Number of requested domains ({}) is greater than MAX_DOMS ({})", cpumasks.len(), atropos_sys::MAX_DOMS ); } - let mut cpu_dom = vec![-1i32; nr_cpus]; + let mut cpu_dom = vec![None; nr_cpus]; let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; cpumasks.len()]; - for (dsq, cpumask) in cpumasks.iter().enumerate() { + for (dom, cpumask) in cpumasks.iter().enumerate() { let hex_str = { let mut tmp_str = cpumask .strip_prefix("0x") @@ -282,25 +282,25 @@ impl Topology { nr_cpus ); } - if cpu_dom[cpu] != -1 { + if let Some(other_dom) = cpu_dom[cpu] { bail!( - "Found cpu ({}) with dsq ({}) but also in cpumask ({})", + "Found cpu ({}) with domain ({}) but also in cpumask ({})", cpu, - cpu_dom[cpu], + other_dom, cpumask ); } - cpu_dom[cpu] = dsq as i32; - dom_cpus[dsq].set(cpu, true); + cpu_dom[cpu] = Some(dom); + dom_cpus[dom].set(cpu, true); } } - dom_cpus[dsq].set_uninitialized(false); + dom_cpus[dom].set_uninitialized(false); } - for (cpu, &dsq) in cpu_dom.iter().enumerate() { - if dsq < 0 { + for (cpu, dom) in cpu_dom.iter().enumerate() { + if dom.is_none() { bail!( - "Cpu {} not assigned to any dsq. Make sure it is covered by some --cpumasks argument.", + "CPU {} not assigned to any domain. Make sure it is covered by some --cpumasks argument.", cpu ); } @@ -315,46 +315,46 @@ impl Topology { } fn from_cache_level(level: u32, nr_cpus: usize) -> Result { - let mut cpu_to_cache = vec![]; // (cpu_id, cache_id) - let mut cache_ids = BTreeSet::::new(); - let mut nr_not_found = 0; + let mut cpu_to_cache = vec![]; // (cpu_id, Option) + let mut cache_ids = BTreeSet::::new(); + let mut nr_offline = 0; // Build cpu -> cache ID mapping. for cpu in 0..nr_cpus { let path = format!("/sys/devices/system/cpu/cpu{}/cache/index{}/id", cpu, level); let id = match std::fs::read_to_string(&path) { - Ok(val) => val - .trim() - .parse::() - .with_context(|| format!("Failed to parse {:?}'s content {:?}", &path, &val))?, + Ok(val) => Some(val.trim().parse::().with_context(|| { + format!("Failed to parse {:?}'s content {:?}", &path, &val) + })?), Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - nr_not_found += 1; - 0 + nr_offline += 1; + None } Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), }; cpu_to_cache.push(id); - cache_ids.insert(id); + if id.is_some() { + cache_ids.insert(id.unwrap()); + } } - if nr_not_found > 1 { - warn!( - "Couldn't determine level {} cache IDs for {} CPUs out of {}, assigned to cache ID 0", - level, nr_not_found, nr_cpus - ); - } + info!( + "CPUs: online/possible = {}/{}", + nr_cpus - nr_offline, + nr_cpus + ); // Cache IDs may have holes. Assign consecutive domain IDs to // existing cache IDs. 
- let mut cache_to_dom = BTreeMap::::new(); + let mut cache_to_dom = BTreeMap::::new(); let mut nr_doms = 0; for cache_id in cache_ids.iter() { cache_to_dom.insert(*cache_id, nr_doms); nr_doms += 1; } - if nr_doms > atropos_sys::MAX_DOMS { + if nr_doms > atropos_sys::MAX_DOMS as usize { bail!( "Total number of doms {} is greater than MAX_DOMS ({})", nr_doms, @@ -368,9 +368,17 @@ impl Topology { let mut cpu_dom = vec![]; for cpu in 0..nr_cpus { - let dom_id = cache_to_dom[&cpu_to_cache[cpu]]; - dom_cpus[dom_id as usize].set(cpu, true); - cpu_dom.push(dom_id as i32); + match cpu_to_cache[cpu] { + Some(cache_id) => { + let dom_id = cache_to_dom[&cache_id]; + dom_cpus[dom_id].set(cpu, true); + cpu_dom.push(Some(dom_id)); + } + None => { + dom_cpus[0].set(cpu, true); + cpu_dom.push(None); + } + } } Ok(Self { @@ -408,12 +416,14 @@ impl Tuner { let mut dom_util_sum = vec![0.0; self.top.nr_doms]; for cpu in 0..self.top.nr_cpus { - let dom = self.top.cpu_dom[cpu] as usize; - - // None MyCpuStat indicates offline CPU. Ignore. - if let (Some(curr), Some(prev)) = - (curr_cpu_stats.get(&cpu), self.prev_cpu_stats.get(&cpu)) - { + // None domain indicates the CPU was offline during + // initialization and None MyCpuStat indicates the CPU has gone + // down since then. Ignore both. + if let (Some(dom), Some(curr), Some(prev)) = ( + self.top.cpu_dom[cpu], + curr_cpu_stats.get(&cpu), + self.prev_cpu_stats.get(&cpu), + ) { dom_nr_cpus[dom] += 1; dom_util_sum[dom] += curr.calc_util(prev); } @@ -421,8 +431,8 @@ impl Tuner { for dom in 0..self.top.nr_doms { // Calculate the domain avg util. If there are no active CPUs, - // it doesn't really matter. Go with 0.0 as that's less likely - // to confuse users. + // it doesn't really matter. Go with 0.0 as that's less likely + // to confuse users. let util = match dom_nr_cpus[dom] { 0 => 0.0, nr => dom_util_sum[dom] / nr as f64, @@ -433,11 +443,13 @@ impl Tuner { // This could be implemented better. let update_dom_bits = |target: &mut [u64; 8], val: bool| { for cpu in 0..self.top.nr_cpus { - if self.top.cpu_dom[cpu] as usize == dom { - if val { - target[cpu / 64] |= 1u64 << (cpu % 64); - } else { - target[cpu / 64] &= !(1u64 << (cpu % 64)); + if let Some(cdom) = self.top.cpu_dom[cpu] { + if cdom == dom { + if val { + target[cpu / 64] |= 1u64 << (cpu % 64); + } else { + target[cpu / 64] &= !(1u64 << (cpu % 64)); + } } } } @@ -868,7 +880,7 @@ impl<'a> Scheduler<'a> { skel.rodata().nr_cpus = top.nr_cpus as u32; for (cpu, dom) in top.cpu_dom.iter().enumerate() { - skel.rodata().cpu_dom_id_map[cpu] = *dom as u32; + skel.rodata().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; } for (dom, cpus) in top.dom_cpus.iter().enumerate() { From 310ed7c9f7c8c07fc69b401d5d935ff18d290272 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jun 2023 10:56:39 -1000 Subject: [PATCH 065/304] SCX: Use sched_smt_active() instead of testing sched_smt_present directly Suggested by David Vernet. This also allows removing explicit #ifdef's. 
--- kernel/sched/ext.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index b9cece8558349..2064abaab739f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1933,8 +1933,7 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) int cpu; retry: -#ifdef CONFIG_SCHED_SMT - if (static_branch_likely(&sched_smt_present)) { + if (sched_smt_active()) { cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); if (cpu < nr_cpu_ids) { const struct cpumask *sbm = topology_sibling_cpumask(cpu); @@ -1955,7 +1954,7 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) if (flags & SCX_PICK_IDLE_CPU_WHOLE) return -EBUSY; } -#endif + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); if (cpu >= nr_cpu_ids) return -EBUSY; @@ -1992,12 +1991,11 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag if (p->nr_cpus_allowed == 1) return prev_cpu; -#ifdef CONFIG_SCHED_SMT /* * If CPU has SMT, any wholly idle CPU is likely a better pick than * partially idle @prev_cpu. */ - if (static_branch_likely(&sched_smt_present)) { + if (sched_smt_active()) { if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && test_and_clear_cpu_idle(prev_cpu)) { p->scx.flags |= SCX_TASK_ENQ_LOCAL; @@ -2010,7 +2008,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag return cpu; } } -#endif if (test_and_clear_cpu_idle(prev_cpu)) { p->scx.flags |= SCX_TASK_ENQ_LOCAL; From 0705fe5dc1b9ee9f61009d4f3770633f8eda7fc1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jun 2023 11:45:10 -1000 Subject: [PATCH 066/304] SCX: Move idle_masks.smt clearing into test_and_clear_idle_cpu() Whenever a CPU's idle state is cleared, the matching SMT mask should be updated accordingly; however, scx_pick_idle_cpu() open coded this logic and other users of test_and_clear_idle_cpu() wasn't updating idle_masks.smt correctly. Fix it by moving SMT mask clearing from scx_pick_idle_cpu() to test_and_clear_idle_cpu(). This issue was noticed by David Vernet. --- kernel/sched/ext.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2064abaab739f..4e9365be1a73b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1925,7 +1925,25 @@ void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, static bool test_and_clear_cpu_idle(int cpu) { - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); + if (!cpumask_test_and_clear_cpu(cpu, idle_masks.cpu)) + return false; + + if (sched_smt_active()) { + const struct cpumask *sbm = topology_sibling_cpumask(cpu); + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from idle_masks.smt. Ensure that @cpu + * is eventually cleared. 
+ */ + if (cpumask_intersects(sbm, idle_masks.smt)) + cpumask_andnot(idle_masks.smt, idle_masks.smt, sbm); + else if (cpumask_test_cpu(cpu, idle_masks.smt)) + __cpumask_clear_cpu(cpu, idle_masks.smt); + } + + return true; } static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) @@ -1935,21 +1953,8 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) retry: if (sched_smt_active()) { cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); - if (cpu < nr_cpu_ids) { - const struct cpumask *sbm = topology_sibling_cpumask(cpu); - - /* - * If offline, @cpu is not its own sibling and we can - * get caught in an infinite loop as @cpu is never - * cleared from idle_masks.smt. Clear @cpu directly in - * such cases. - */ - if (likely(cpumask_test_cpu(cpu, sbm))) - cpumask_andnot(idle_masks.smt, idle_masks.smt, sbm); - else - cpumask_andnot(idle_masks.smt, idle_masks.smt, cpumask_of(cpu)); + if (cpu < nr_cpu_ids) goto found; - } if (flags & SCX_PICK_IDLE_CPU_WHOLE) return -EBUSY; From ef949c99103ffb76641701ce6f7b84969606ac91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jun 2023 16:40:34 -1000 Subject: [PATCH 067/304] SCX: Fix error handling in atropos_select_cpu() On some error exit paths, atropos_select_cpu() was releasing idle_smtmask before it was acquired. For some reason, the BPF verifier didn't notice this. It didn't actually break anything because the release operation is noop in this specific case. Fix it by acquiring idle_smtmask at the beginning of function and always releasing it before returning. While at it, make error return values more consistent. --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 92144786cd9e2..d5c3b1519606f 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -273,8 +273,8 @@ static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { + struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); struct task_ctx *task_ctx; - struct cpumask *idle_smtmask; struct bpf_cpumask *p_cpumask; pid_t pid = p->pid; bool prev_domestic, has_idle_wholes; @@ -284,7 +284,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid)) || !(p_cpumask = task_ctx->cpumask)) - return -ENOENT; + goto enoent; if (kthreads_local && (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { @@ -311,13 +311,13 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (!domc) { scx_bpf_error("Failed to find dom%u", task_ctx->dom_id); - return prev_cpu; + goto enoent; } d_cpumask = domc->cpumask; if (!d_cpumask) { scx_bpf_error("Failed to acquire dom%u cpumask kptr", task_ctx->dom_id); - return prev_cpu; + goto enoent; } idle_cpumask = scx_bpf_get_idle_cpumask(); @@ -344,7 +344,6 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, goto direct; } - idle_smtmask = scx_bpf_get_idle_smtmask(); has_idle_wholes = !bpf_cpumask_empty(idle_smtmask); /* did @p get pulled out to a foreign domain by e.g. greedy execution? 
*/ @@ -418,8 +417,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (!(domc = bpf_map_lookup_elem(&dom_ctx, &dom_id))) { scx_bpf_error("Failed to lookup dom[%u]", dom_id); - scx_bpf_put_idle_cpumask(idle_smtmask); - return -ENOENT; + goto enoent; } /* @@ -488,6 +486,10 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, task_ctx->dispatch_local = true; scx_bpf_put_idle_cpumask(idle_smtmask); return cpu; + +enoent: + scx_bpf_put_idle_cpumask(idle_smtmask); + return -ENOENT; } void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) From f955a017c997a907665ae0c4730e1fbb7aa0e827 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jun 2023 16:40:34 -1000 Subject: [PATCH 068/304] SCX: Improve comments --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 19 +++++++++++++++---- tools/sched_ext/atropos/src/main.rs | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index d5c3b1519606f..07c1dc17a48ab 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -363,6 +363,11 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, goto direct; } } else { + /* + * @prev_cpu is foreign. Linger iff the domain isn't too busy as + * indicated by direct_greedy_cpumask. There may also be an idle + * CPU in the domestic domain + */ if (direct_greedy_cpumask && bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *) direct_greedy_cpumask) && @@ -375,7 +380,9 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, } /* - * @prev_cpu didn't work out. Find the best idle domestic CPU. + * @prev_cpu didn't work out. Let's see whether there's an idle CPU @p + * can be directly dispatched to. We'll first try to find the best idle + * domestic CPU and then move onto foreign. */ /* If there is a domestic whole idle CPU, dispatch directly */ @@ -474,7 +481,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, * stalls as all in-domain CPUs may be idle by the time @p gets * enqueued. */ - if (bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *)p_cpumask)) + if (prev_domestic) cpu = prev_cpu; else cpu = bpf_cpumask_any((const struct cpumask *)p_cpumask); @@ -541,8 +548,8 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) * foreign CPU due to a greedy execution and not have gone through * ->select_cpu() if it's being enqueued e.g. after slice exhaustion. If * so, @p would be queued on its domain's dsq but none of the CPUs in - * the domain would be woken up for it which can induce execution - * bubles. Kick a domestic CPU if @p is on a foreign domain. + * the domain would be woken up which can induce temporary execution + * stalls. Kick a domestic CPU if @p is on a foreign domain. */ if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), (const struct cpumask *)p_cpumask)) { cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); @@ -758,6 +765,10 @@ static u32 task_pick_domain(struct task_ctx *task_ctx, struct task_struct *p, dom = (dom + 1) % nr_doms; if (cpumask_intersects_domain(cpumask, dom)) { task_ctx->dom_mask |= 1LLU << dom; + /* + * AsThe starting point is round-robin'd and the first + * match should be spread across all the domains. 
+ */ if (first_dom == MAX_DOMS) first_dom = dom; } diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 6755bd229a2b9..1ee8f11f8dd26 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -41,7 +41,7 @@ use ordered_float::OrderedFloat; /// chiplet in a six-chiplet AMD processor, and could match the performance of /// production setup using CFS. /// -/// WARNING: Atropos currenlty assumes that all domains have equal +/// WARNING: Atropos currently assumes that all domains have equal /// processing power and at similar distances from each other. This /// limitation will be removed in the future. #[derive(Debug, Parser)] From 4748b4e682e4d97879d2979c3f37a12a3207e00c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jun 2023 16:40:34 -1000 Subject: [PATCH 069/304] SCX: atropos: Add scx_bpf_pick_any_cpu() and use it in atropos scx_bpf_pick_idle_cpu() returning -EBUSY doesn't mean that the caller can assume that there will be future dispatch events because idle tracking may race with actual scheduler state transitions. The only way to guarantee timely future dispatch is kicking one of the target CPUs. Document it and add scx_bpf_pick_any_cpu() which can be used to pick a CPU, idle or not. atropos is updated to use the new function instead of using cpumask_any() directly on the allowed CPUs. --- kernel/sched/ext.c | 40 +++++++++++++++++++ tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 6 +-- tools/sched_ext/scx_common.bpf.h | 1 + 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4e9365be1a73b..c934664927cf1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4117,6 +4117,14 @@ bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu * number on success. -%EBUSY if no matching cpu was found. * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * * Unavailable if ops.update_idle() is implemented and * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. */ @@ -4130,6 +4138,37 @@ s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) return scx_pick_idle_cpu(cpus_allowed, flags); } +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to suceed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. 
+ */ +s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + s32 cpu; + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + /** * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking * per-CPU cpumask. @@ -4316,6 +4355,7 @@ BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index 07c1dc17a48ab..e118550fd540a 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -484,7 +484,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (prev_domestic) cpu = prev_cpu; else - cpu = bpf_cpumask_any((const struct cpumask *)p_cpumask); + cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); scx_bpf_put_idle_cpumask(idle_smtmask); return cpu; @@ -552,9 +552,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) * stalls. Kick a domestic CPU if @p is on a foreign domain. */ if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), (const struct cpumask *)p_cpumask)) { - cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); - if (cpu < 0) - cpu = bpf_cpumask_any((const struct cpumask *)p_cpumask); + cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); scx_bpf_kick_cpu(cpu, 0); stat_add(ATROPOS_STAT_REPATRIATE, 1); } diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 712f7f7c1b03c..5a6136dceb4d4 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -63,6 +63,7 @@ void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; From 78447aba9672bbee754bda7b91f6ad6ffe2c8dde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jun 2023 17:43:09 -1000 Subject: [PATCH 070/304] SCX: atropos: Update kicking during task migration across domains atropos_enqueue() currently kicks the previous domain if the task has dispatch_local set with the comment explaining that we need to do so because we own the idle state of the task. This doesn't seem to make sense because if select_cpu() picked prev_cpu and set dispatch_local, the CPU is already woken up and enqueue() would be running on that CPU. Whether the task is dispatched locally or not doesn't cause the CPU to stall. It will go check all the queues after this and reassert idle. As there were other stall issues, it could just be that I was confused. Preliminary testing doesn't show any problems after removing it.
However, on the new domain side, it could miss waking up a CPU when no idle bits are set which is racy and has some chance of causing a temporary stall. Fix it using scx_bpf_pick_any_cpu(). --- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index e118550fd540a..c80e8cc79c977 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -520,18 +520,8 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) if (new_dom && *new_dom != task_ctx->dom_id && task_set_domain(task_ctx, p, *new_dom, false)) { stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); - - /* - * If dispatch_local is set, We own @p's idle state but we are - * not gonna put the task in the associated local dsq which can - * cause the CPU to stall. Kick it. - */ - if (task_ctx->dispatch_local) { - task_ctx->dispatch_local = false; - scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0); - } - - cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); + task_ctx->dispatch_local = false; + cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) scx_bpf_kick_cpu(cpu, 0); goto dom_queue; From 52db350b405153d1bc5cbc619060d4fc360b230e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 21 Jun 2023 11:05:24 -0500 Subject: [PATCH 071/304] atropos: Use Instant instead of SystemTime Atropos currently uses the std::time::SystemTime object to time when load balance steps, etc should occur. As described in [0], SystemTime::duration_since() can throw an error if the @earlier field is in fact later than self. This can occur randomly even with correct code, as according to the docs: > This function may fail because measurements taken earlier are not > guaranteed to always be before later measurements (due to anomalies > such as the system clock being adjusted either forwards or backwards). [0]: https://doc.rust-lang.org/std/time/struct.SystemTime.html#method.duration_since The solution is to instead use std::time::Instant, which is monotonic as described in [1]. [1]: https://doc.rust-lang.org/std/time/struct.Instant.html#method.duration_since Without this patch, atropos will error out after ~1 minute of running rcutorture. With the patch, it is able to run (seemingly) indefinitely. 
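To illustrate the pattern (a minimal standalone sketch with placeholder intervals, not the scheduler's actual run() loop; the catch-up handling for missed deadlines is omitted), the same dual-interval timing can be expressed on top of Instant without any error handling, since Instant arithmetic never has to report a clock going backwards:

    use std::time::{Duration, Instant};

    fn main() {
        // Placeholder intervals; the real values come from the command line.
        let tune_interval = Duration::from_millis(100);
        let sched_interval = Duration::from_millis(250);

        let now = Instant::now();
        let mut next_tune_at = now + tune_interval;
        let mut next_sched_at = now + sched_interval;

        for _ in 0..10 {
            let now = Instant::now();

            if now >= next_tune_at {
                // tuner.step() would run here.
                next_tune_at += tune_interval;
            }
            if now >= next_sched_at {
                // lb_step() would run here.
                next_sched_at += sched_interval;
            }

            // Instant is monotonic, so this computation never fails; it is
            // clamped to zero if the next deadline has already passed.
            let sleep_for = next_tune_at
                .min(next_sched_at)
                .saturating_duration_since(Instant::now());
            std::thread::sleep(sleep_for);
        }
    }

saturating_duration_since() is used here only to make the clamp-at-zero explicit; the key point is that, unlike SystemTime::duration_since(), the Instant-based computation yields a plain Duration instead of a Result that has to be unwrapped with ?.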
Signed-off-by: David Vernet --- tools/sched_ext/atropos/src/main.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs index 1ee8f11f8dd26..6d8ea6f4ef3c0 100644 --- a/tools/sched_ext/atropos/src/main.rs +++ b/tools/sched_ext/atropos/src/main.rs @@ -17,7 +17,7 @@ use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; -use std::time::SystemTime; +use std::time::Instant; use anyhow::anyhow; use anyhow::bail; @@ -844,7 +844,7 @@ struct Scheduler<'a> { top: Arc, - prev_at: SystemTime, + prev_at: Instant, prev_total_cpu: MyCpuStat, task_loads: BTreeMap, @@ -928,7 +928,7 @@ impl<'a> Scheduler<'a> { top: top.clone(), - prev_at: SystemTime::now(), + prev_at: Instant::now(), prev_total_cpu, task_loads: BTreeMap::new(), @@ -1060,7 +1060,7 @@ impl<'a> Scheduler<'a> { } fn lb_step(&mut self) -> Result<()> { - let started_at = std::time::SystemTime::now(); + let started_at = Instant::now(); let bpf_stats = self.read_bpf_stats()?; let cpu_busy = self.get_cpu_busy()?; @@ -1073,7 +1073,7 @@ impl<'a> Scheduler<'a> { &mut self.nr_lb_data_errors, ); - lb.read_task_loads(started_at.duration_since(self.prev_at)?)?; + lb.read_task_loads(started_at.duration_since(self.prev_at))?; lb.calculate_dom_load_balance()?; if self.balance_load { @@ -1087,7 +1087,7 @@ impl<'a> Scheduler<'a> { self.report( &bpf_stats, cpu_busy, - std::time::SystemTime::now().duration_since(started_at)?, + Instant::now().duration_since(started_at), load_avg, &dom_loads, &imbal, @@ -1121,12 +1121,12 @@ impl<'a> Scheduler<'a> { } fn run(&mut self, shutdown: Arc) -> Result<()> { - let now = std::time::SystemTime::now(); + let now = Instant::now(); let mut next_tune_at = now + self.tune_interval; let mut next_sched_at = now + self.sched_interval; while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_type() == 0 { - let now = std::time::SystemTime::now(); + let now = Instant::now(); if now >= next_tune_at { self.tuner.step(&mut self.skel)?; @@ -1147,7 +1147,7 @@ impl<'a> Scheduler<'a> { std::thread::sleep( next_sched_at .min(next_tune_at) - .duration_since(std::time::SystemTime::now())?, + .duration_since(Instant::now()), ); } From 9adb3cd92f24365802535c7e2468766261c952a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 21 Jun 2023 08:37:27 -1000 Subject: [PATCH 072/304] SCX: test_and_clear_cpu_idle() should always clear SMT mask test_and_clear_cpu_idle() was clearing SMT mask iff the specified CPU was idle. This is incorrect as the SMT cluster obviously not idle if CPU was busy and can lead to scx_pick_idle_cpu() loop infinitely trying to claim the same CPU from SMT mask which never gets cleared. --- kernel/sched/ext.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c934664927cf1..d689257c68c20 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1925,9 +1925,11 @@ void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, static bool test_and_clear_cpu_idle(int cpu) { - if (!cpumask_test_and_clear_cpu(cpu, idle_masks.cpu)) - return false; - + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. 
+ */ if (sched_smt_active()) { const struct cpumask *sbm = topology_sibling_cpumask(cpu); @@ -1943,7 +1945,7 @@ static bool test_and_clear_cpu_idle(int cpu) __cpumask_clear_cpu(cpu, idle_masks.smt); } - return true; + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); } static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) From dec3eaaab0db6b0ae04d18355e4e958e8f37446e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 21 Jun 2023 16:33:23 -1000 Subject: [PATCH 073/304] SCX: s/task_on_scx()/task_should_scx()/ and add new task_on_scx() which tests current state Contrary to the name, task_on_scx() determined whether a task should be switched to SCX not whether it's currently on it. This is a bit confusing and makes it tricky to name a function which actually tests whether a task is currently on ext_sched_class. * Rename task_on_scx() to task_should_scx(). * Add task_on_scx() which tests whether the specified task is currently on ext_sched_class and replace explicit tests in kernel/sched/core.c and debug.c. --- kernel/sched/core.c | 19 ++++++++----------- kernel/sched/debug.c | 2 +- kernel/sched/ext.c | 2 +- kernel/sched/ext.h | 8 +++++++- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 428f2fa2c954e..3ff57b0b2bc84 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -167,10 +167,8 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ -#ifdef CONFIG_SCHED_CLASS_EXT - if (p->sched_class == &ext_sched_class) + if (task_on_scx(p)) return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ -#endif return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } @@ -3954,16 +3952,15 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { -#ifdef CONFIG_SCHED_CLASS_EXT /* * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups and @p may end up executing on a different CPU regardless of - * what happens in the wakeup path making the ttwu_queue optimization - * ineffective. Skip if on SCX. + * wakeups. In addition, @p may end up executing on a different CPU + * regardless of what happens in the wakeup path making the ttwu_queue + * optimization less meaningful. Skip if on SCX. */ - if (p->sched_class == &ext_sched_class) + if (task_on_scx(p)) return false; -#endif + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. 
@@ -4799,7 +4796,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) } else if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - } else if (task_on_scx(p)) { + } else if (task_should_scx(p)) { p->sched_class = &ext_sched_class; #endif } else { @@ -7096,7 +7093,7 @@ void __setscheduler_prio(struct task_struct *p, int prio) else if (rt_prio(prio)) p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - else if (task_on_scx(p)) + else if (task_should_scx(p)) p->sched_class = &ext_sched_class; #endif else diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6eaa63cf0e20b..79fac9c92a228 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1097,7 +1097,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.deadline); } #ifdef CONFIG_SCHED_CLASS_EXT - __PS("ext.enabled", p->sched_class == &ext_sched_class); + __PS("ext.enabled", task_on_scx(p)); #endif #undef PN_SCHEDSTAT #undef P_SCHEDSTAT diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d689257c68c20..15ae7b32e1681 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2867,7 +2867,7 @@ static void scx_cgroup_config_knobs(void) {} * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. */ -bool task_on_scx(struct task_struct *p) +bool task_should_scx(struct task_struct *p) { if (!scx_enabled() || scx_ops_disabling()) return false; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 7bc5e871f7fd1..08d7925075089 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -110,7 +110,12 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -bool task_on_scx(struct task_struct *p); +static inline bool task_on_scx(struct task_struct *p) +{ + return scx_enabled() && p->sched_class == &ext_sched_class; +} + +bool task_should_scx(struct task_struct *p); void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); @@ -198,6 +203,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #define scx_enabled() false #define scx_switched_all() false +static inline bool task_on_scx(struct task_struct *p) { return false; } static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} From 7404b6cbce8dc420b61a81c1e4005d8de06264cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jun 2023 10:20:47 -1000 Subject: [PATCH 074/304] SCX: Typo fix --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 15ae7b32e1681..52f5ebb402c94 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4146,7 +4146,7 @@ s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) * @flags: %SCX_PICK_IDLE_CPU_* flags * * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to suceed and returns the picked idle cpu + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is * empty. 
* From cb4315d22aaf22e138f4e97c748f6f0c99dd9713 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jun 2023 10:59:36 -1000 Subject: [PATCH 075/304] SCX: Use "idle core" instead of "whole cpu" While "core" is a heavily overloaded word, it's what used inside the kernel to describe a whole physical SMT group. Let's use that in SCX and atropos too. --- kernel/sched/ext.c | 4 +-- kernel/sched/ext.h | 2 +- tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 29 +++++++++---------- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 52f5ebb402c94..6b762bf2fa51c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1958,7 +1958,7 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) if (cpu < nr_cpu_ids) goto found; - if (flags & SCX_PICK_IDLE_CPU_WHOLE) + if (flags & SCX_PICK_IDLE_CORE) return -EBUSY; } @@ -2009,7 +2009,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag return prev_cpu; } - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CPU_WHOLE); + cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); if (cpu >= 0) { p->scx.flags |= SCX_TASK_ENQ_LOCAL; return cpu; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 08d7925075089..7e2900b8f6f21 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -92,7 +92,7 @@ enum scx_kick_flags { }; enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CPU_WHOLE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ }; #ifdef CONFIG_SCHED_CLASS_EXT diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c index c80e8cc79c977..118fe728e886d 100644 --- a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -277,7 +277,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, struct task_ctx *task_ctx; struct bpf_cpumask *p_cpumask; pid_t pid = p->pid; - bool prev_domestic, has_idle_wholes; + bool prev_domestic, has_idle_cores; s32 cpu; refresh_tune_params(); @@ -344,7 +344,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, goto direct; } - has_idle_wholes = !bpf_cpumask_empty(idle_smtmask); + has_idle_cores = !bpf_cpumask_empty(idle_smtmask); /* did @p get pulled out to a foreign domain by e.g. greedy execution? */ prev_domestic = bpf_cpumask_test_cpu(prev_cpu, @@ -385,10 +385,10 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, * domestic CPU and then move onto foreign. */ - /* If there is a domestic whole idle CPU, dispatch directly */ - if (has_idle_wholes) { + /* If there is a domestic idle core, dispatch directly */ + if (has_idle_cores) { cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, - SCX_PICK_IDLE_CPU_WHOLE); + SCX_PICK_IDLE_CORE); if (cpu >= 0) { stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); goto direct; @@ -396,8 +396,8 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, } /* - * If @prev_cpu was domestic and is idle itself even though the whole - * core isn't, picking @prev_cpu may improve L1/2 locality. + * If @prev_cpu was domestic and is idle itself even though the core + * isn't, picking @prev_cpu may improve L1/2 locality. 
*/ if (prev_domestic && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); @@ -415,7 +415,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, /* * Domestic domain is fully booked. If there are CPUs which are idle and * under-utilized, ignore domain boundaries and push the task there. Try - * to find a whole idle CPU first. + * to find an idle core first. */ if (task_ctx->all_cpus && direct_greedy_cpumask && !bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) { @@ -427,15 +427,12 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, goto enoent; } - /* - * Try to find a whole idle CPU in the previous foreign and then - * any domain. - */ - if (has_idle_wholes) { + /* Try to find an idle core in the previous and then any domain */ + if (has_idle_cores) { if (domc->direct_greedy_cpumask) { cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) domc->direct_greedy_cpumask, - SCX_PICK_IDLE_CPU_WHOLE); + SCX_PICK_IDLE_CORE); if (cpu >= 0) { stat_add(ATROPOS_STAT_DIRECT_GREEDY, 1); goto direct; @@ -445,7 +442,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (direct_greedy_cpumask) { cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) direct_greedy_cpumask, - SCX_PICK_IDLE_CPU_WHOLE); + SCX_PICK_IDLE_CORE); if (cpu >= 0) { stat_add(ATROPOS_STAT_DIRECT_GREEDY_FAR, 1); goto direct; @@ -454,7 +451,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, } /* - * No whole idle CPU. Is there any idle CPU? + * No idle core. Is there any idle CPU? */ if (domc->direct_greedy_cpumask) { cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) From 6e866b328a97529f535f908bdfb4bc1e66b0bfa7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jun 2023 15:00:17 -1000 Subject: [PATCH 076/304] SCX: Drop _example prefix from scheduler names and make the naming more consistent The idea was to signify that the scheduler isn't ready for any kind of production use with the _example prefix. The distinction, however, isn't clear cut and changes over time. We already have README and help message detailing the behavior characteristics. Let's drop the prefix. 
While at it, make file and directory names more consistent too: * s/scx_example_userland_common.h/scx_userland.h/ * s/atropos/scx_atropos/ --- Documentation/scheduler/sched-ext.rst | 17 +++++---- tools/sched_ext/.gitignore | 12 +++---- tools/sched_ext/Makefile | 33 ++++++++--------- tools/sched_ext/README | 36 +++++++++---------- .../{atropos => scx_atropos}/.gitignore | 0 .../{atropos => scx_atropos}/Cargo.toml | 0 .../{atropos => scx_atropos}/build.rs | 0 .../{atropos => scx_atropos}/rustfmt.toml | 0 .../src/atropos_sys.rs | 0 .../src/bpf/atropos.bpf.c | 0 .../src/bpf/atropos.h | 0 .../{atropos => scx_atropos}/src/main.rs | 0 ...xample_central.bpf.c => scx_central.bpf.c} | 0 .../{scx_example_central.c => scx_central.c} | 10 +++--- ..._example_flatcg.bpf.c => scx_flatcg.bpf.c} | 2 +- .../{scx_example_flatcg.c => scx_flatcg.c} | 14 ++++---- .../{scx_example_flatcg.h => scx_flatcg.h} | 0 ...{scx_example_pair.bpf.c => scx_pair.bpf.c} | 2 +- .../{scx_example_pair.c => scx_pair.c} | 12 +++---- .../{scx_example_pair.h => scx_pair.h} | 0 ...{scx_example_qmap.bpf.c => scx_qmap.bpf.c} | 0 .../{scx_example_qmap.c => scx_qmap.c} | 10 +++--- ..._example_simple.bpf.c => scx_simple.bpf.c} | 0 .../{scx_example_simple.c => scx_simple.c} | 12 +++---- ...mple_userland.bpf.c => scx_userland.bpf.c} | 2 +- ...{scx_example_userland.c => scx_userland.c} | 14 ++++---- ...ample_userland_common.h => scx_userland.h} | 0 27 files changed, 86 insertions(+), 90 deletions(-) rename tools/sched_ext/{atropos => scx_atropos}/.gitignore (100%) rename tools/sched_ext/{atropos => scx_atropos}/Cargo.toml (100%) rename tools/sched_ext/{atropos => scx_atropos}/build.rs (100%) rename tools/sched_ext/{atropos => scx_atropos}/rustfmt.toml (100%) rename tools/sched_ext/{atropos => scx_atropos}/src/atropos_sys.rs (100%) rename tools/sched_ext/{atropos => scx_atropos}/src/bpf/atropos.bpf.c (100%) rename tools/sched_ext/{atropos => scx_atropos}/src/bpf/atropos.h (100%) rename tools/sched_ext/{atropos => scx_atropos}/src/main.rs (100%) rename tools/sched_ext/{scx_example_central.bpf.c => scx_central.bpf.c} (100%) rename tools/sched_ext/{scx_example_central.c => scx_central.c} (91%) rename tools/sched_ext/{scx_example_flatcg.bpf.c => scx_flatcg.bpf.c} (99%) rename tools/sched_ext/{scx_example_flatcg.c => scx_flatcg.c} (95%) rename tools/sched_ext/{scx_example_flatcg.h => scx_flatcg.h} (100%) rename tools/sched_ext/{scx_example_pair.bpf.c => scx_pair.bpf.c} (99%) rename tools/sched_ext/{scx_example_pair.c => scx_pair.c} (94%) rename tools/sched_ext/{scx_example_pair.h => scx_pair.h} (100%) rename tools/sched_ext/{scx_example_qmap.bpf.c => scx_qmap.bpf.c} (100%) rename tools/sched_ext/{scx_example_qmap.c => scx_qmap.c} (94%) rename tools/sched_ext/{scx_example_simple.bpf.c => scx_simple.bpf.c} (100%) rename tools/sched_ext/{scx_example_simple.c => scx_simple.c} (88%) rename tools/sched_ext/{scx_example_userland.bpf.c => scx_userland.bpf.c} (99%) rename tools/sched_ext/{scx_example_userland.c => scx_userland.c} (97%) rename tools/sched_ext/{scx_example_userland_common.h => scx_userland.h} (100%) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 2ef2f409f4a66..25ddb535c2972 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -43,7 +43,7 @@ BPF scheduler and reverts all tasks back to CFS. .. 
code-block:: none # make -j16 -C tools/sched_ext - # tools/sched_ext/scx_example_simple + # tools/sched_ext/scx_simple local=0 global=3 local=5 global=24 local=9 global=44 @@ -73,8 +73,7 @@ Userspace can implement an arbitrary BPF scheduler by loading a set of BPF programs that implement ``struct sched_ext_ops``. The only mandatory field is ``ops.name`` which must be a valid BPF object name. All operations are optional. The following modified excerpt is from -``tools/sched/scx_example_simple.bpf.c`` showing a minimal global FIFO -scheduler. +``tools/sched/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. .. code-block:: c @@ -196,8 +195,8 @@ DSQs are consumed automatically. ``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use ``scx_bpf_dispatch_vtime()`` for the priority queue. See the function -documentation and usage in ``tools/sched_ext/scx_example_simple.bpf.c`` for -more information. +documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for more +information. Where to Look ============= @@ -211,11 +210,11 @@ Where to Look * ``tools/sched_ext/`` hosts example BPF scheduler implementations. - * ``scx_example_simple[.bpf].c``: Minimal global FIFO scheduler example - using a custom DSQ. + * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a + custom DSQ. - * ``scx_example_qmap[.bpf].c``: A multi-level FIFO scheduler supporting - five levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. + * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five + levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. ABI Instability =============== diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index a3240f9f7ebae..c63ee5e4f4bb0 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -1,9 +1,9 @@ -scx_example_simple -scx_example_qmap -scx_example_central -scx_example_pair -scx_example_flatcg -scx_example_userland +scx_simple +scx_qmap +scx_central +scx_pair +scx_flatcg +scx_userland *.skel.h *.subskel.h /tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 73c43782837d4..1515ff9cce7f9 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -117,8 +117,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wall -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_example_simple scx_example_qmap scx_example_central scx_example_pair \ - scx_example_flatcg scx_example_userland atropos +all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland scx_atropos # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -169,45 +168,43 @@ endif $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $@ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $(@:.skel.h=.subskel.h) -scx_example_simple: scx_example_simple.c scx_example_simple.skel.h user_exit_info.h +scx_simple: scx_simple.c scx_simple.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_example_qmap: scx_example_qmap.c scx_example_qmap.skel.h user_exit_info.h +scx_qmap: scx_qmap.c scx_qmap.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_example_central: scx_example_central.c scx_example_central.skel.h user_exit_info.h +scx_central: scx_central.c scx_central.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_example_pair: 
scx_example_pair.c scx_example_pair.skel.h user_exit_info.h +scx_pair: scx_pair.c scx_pair.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_example_flatcg: scx_example_flatcg.c scx_example_flatcg.skel.h user_exit_info.h +scx_flatcg: scx_flatcg.c scx_flatcg.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_example_userland: scx_example_userland.c scx_example_userland.skel.h \ - scx_example_userland_common.h user_exit_info.h +scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -atropos: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) -atropos: export ATROPOS_CLANG = $(CLANG) -atropos: export ATROPOS_BPF_CFLAGS = $(BPF_CFLAGS) -atropos: $(INCLUDE_DIR)/vmlinux.h - cargo build --manifest-path=atropos/Cargo.toml $(CARGOFLAGS) +scx_atropos: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) +scx_atropos: export ATROPOS_CLANG = $(CLANG) +scx_atropos: export ATROPOS_BPF_CFLAGS = $(BPF_CFLAGS) +scx_atropos: $(INCLUDE_DIR)/vmlinux.h + cargo build --manifest-path=scx_atropos/Cargo.toml $(CARGOFLAGS) clean: - cargo clean --manifest-path=atropos/Cargo.toml + cargo clean --manifest-path=scx_atropos/Cargo.toml rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h - rm -f scx_example_simple scx_example_qmap scx_example_central \ - scx_example_pair scx_example_flatcg scx_example_userland + rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland -.PHONY: all atropos clean +.PHONY: all scx_atropos clean # delete failed targets .DELETE_ON_ERROR: diff --git a/tools/sched_ext/README b/tools/sched_ext/README index 4a748aaacb20e..33f413f8a4034 100644 --- a/tools/sched_ext/README +++ b/tools/sched_ext/README @@ -94,8 +94,8 @@ architectures and workloads. -------------------------------------------------------------------------------- -scx_example_central -------------------- +scx_central +----------- Overview ~~~~~~~~ @@ -123,8 +123,8 @@ and does not yet have any kind of priority mechanism. -------------------------------------------------------------------------------- -scx_example_flatcg ------------------- +scx_flatcg +---------- Overview ~~~~~~~~ @@ -152,8 +152,8 @@ able to consume more CPU cycles than they are entitled to. -------------------------------------------------------------------------------- -scx_example_pair ----------------- +scx_pair +-------- Overview ~~~~~~~~ @@ -178,8 +178,8 @@ No -------------------------------------------------------------------------------- -scx_example_qmap ----------------- +scx_qmap +-------- Overview ~~~~~~~~ @@ -201,15 +201,15 @@ No -------------------------------------------------------------------------------- -scx_example_simple ------------------- +scx_simple +---------- Overview ~~~~~~~~ -A simple scheduler that provides an example of a minimal sched_ext scheduler. -scx_example_simple can be run in either global weighted vtime mode, or FIFO -mode. +A simple scheduler that provides an example of a minimal sched_ext +scheduler. scx_simple can be run in either global weighted vtime mode, or +FIFO mode. Typical Use Case ~~~~~~~~~~~~~~~~ @@ -228,8 +228,8 @@ simple scheduling policy. 
-------------------------------------------------------------------------------- -scx_example_userland --------------------- +scx_userland +------------ Overview ~~~~~~~~ @@ -259,6 +259,6 @@ Production Ready? ~~~~~~~~~~~~~~~~~ No. This scheduler uses an ordered list for vtime scheduling, and is stricly -less performant than just using something like `scx_example_simple`. It is -purely meant to illustrate that it's possible to build a user space scheduler -on top of sched_ext. +less performant than just using something like `scx_simple`. It is purely +meant to illustrate that it's possible to build a user space scheduler on +top of sched_ext. diff --git a/tools/sched_ext/atropos/.gitignore b/tools/sched_ext/scx_atropos/.gitignore similarity index 100% rename from tools/sched_ext/atropos/.gitignore rename to tools/sched_ext/scx_atropos/.gitignore diff --git a/tools/sched_ext/atropos/Cargo.toml b/tools/sched_ext/scx_atropos/Cargo.toml similarity index 100% rename from tools/sched_ext/atropos/Cargo.toml rename to tools/sched_ext/scx_atropos/Cargo.toml diff --git a/tools/sched_ext/atropos/build.rs b/tools/sched_ext/scx_atropos/build.rs similarity index 100% rename from tools/sched_ext/atropos/build.rs rename to tools/sched_ext/scx_atropos/build.rs diff --git a/tools/sched_ext/atropos/rustfmt.toml b/tools/sched_ext/scx_atropos/rustfmt.toml similarity index 100% rename from tools/sched_ext/atropos/rustfmt.toml rename to tools/sched_ext/scx_atropos/rustfmt.toml diff --git a/tools/sched_ext/atropos/src/atropos_sys.rs b/tools/sched_ext/scx_atropos/src/atropos_sys.rs similarity index 100% rename from tools/sched_ext/atropos/src/atropos_sys.rs rename to tools/sched_ext/scx_atropos/src/atropos_sys.rs diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/scx_atropos/src/bpf/atropos.bpf.c similarity index 100% rename from tools/sched_ext/atropos/src/bpf/atropos.bpf.c rename to tools/sched_ext/scx_atropos/src/bpf/atropos.bpf.c diff --git a/tools/sched_ext/atropos/src/bpf/atropos.h b/tools/sched_ext/scx_atropos/src/bpf/atropos.h similarity index 100% rename from tools/sched_ext/atropos/src/bpf/atropos.h rename to tools/sched_ext/scx_atropos/src/bpf/atropos.h diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/scx_atropos/src/main.rs similarity index 100% rename from tools/sched_ext/atropos/src/main.rs rename to tools/sched_ext/scx_atropos/src/main.rs diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_central.bpf.c similarity index 100% rename from tools/sched_ext/scx_example_central.bpf.c rename to tools/sched_ext/scx_central.bpf.c diff --git a/tools/sched_ext/scx_example_central.c b/tools/sched_ext/scx_central.c similarity index 91% rename from tools/sched_ext/scx_example_central.c rename to tools/sched_ext/scx_central.c index 7ad591cbdc65c..7481d3c9123a8 100644 --- a/tools/sched_ext/scx_example_central.c +++ b/tools/sched_ext/scx_central.c @@ -12,7 +12,7 @@ #include #include #include "user_exit_info.h" -#include "scx_example_central.skel.h" +#include "scx_central.skel.h" const char help_fmt[] = "A central FIFO sched_ext scheduler.\n" @@ -34,7 +34,7 @@ static void sigint_handler(int dummy) int main(int argc, char **argv) { - struct scx_example_central *skel; + struct scx_central *skel; struct bpf_link *link; u64 seq = 0; s32 opt; @@ -44,7 +44,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - skel = scx_example_central__open(); + skel = scx_central__open(); assert(skel); skel->rodata->central_cpu = 0; @@ -64,7 
+64,7 @@ int main(int argc, char **argv) } } - assert(!scx_example_central__load(skel)); + assert(!scx_central__load(skel)); link = bpf_map__attach_struct_ops(skel->maps.central_ops); assert(link); @@ -89,6 +89,6 @@ int main(int argc, char **argv) bpf_link__destroy(link); uei_print(&skel->bss->uei); - scx_example_central__destroy(skel); + scx_central__destroy(skel); return 0; } diff --git a/tools/sched_ext/scx_example_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c similarity index 99% rename from tools/sched_ext/scx_example_flatcg.bpf.c rename to tools/sched_ext/scx_flatcg.bpf.c index e79f941d588d9..6d8c6f396577a 100644 --- a/tools/sched_ext/scx_example_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -45,7 +45,7 @@ */ #include "scx_common.bpf.h" #include "user_exit_info.h" -#include "scx_example_flatcg.h" +#include "scx_flatcg.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_example_flatcg.c b/tools/sched_ext/scx_flatcg.c similarity index 95% rename from tools/sched_ext/scx_example_flatcg.c rename to tools/sched_ext/scx_flatcg.c index f9c8a5b84a703..40aa464c55b1a 100644 --- a/tools/sched_ext/scx_example_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -14,8 +14,8 @@ #include #include #include "user_exit_info.h" -#include "scx_example_flatcg.h" -#include "scx_example_flatcg.skel.h" +#include "scx_flatcg.h" +#include "scx_flatcg.skel.h" #ifndef FILEID_KERNFS #define FILEID_KERNFS 0xfe @@ -91,7 +91,7 @@ static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0; } -static void fcg_read_stats(struct scx_example_flatcg *skel, __u64 *stats) +static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) { __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; __u32 idx; @@ -112,7 +112,7 @@ static void fcg_read_stats(struct scx_example_flatcg *skel, __u64 *stats) int main(int argc, char **argv) { - struct scx_example_flatcg *skel; + struct scx_flatcg *skel; struct bpf_link *link; struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; bool dump_cgrps = false; @@ -126,7 +126,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - skel = scx_example_flatcg__open(); + skel = scx_flatcg__open(); if (!skel) { fprintf(stderr, "Failed to open: %s\n", strerror(errno)); return 1; @@ -168,7 +168,7 @@ int main(int argc, char **argv) (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, dump_cgrps); - if (scx_example_flatcg__load(skel)) { + if (scx_flatcg__load(skel)) { fprintf(stderr, "Failed to load: %s\n", strerror(errno)); return 1; } @@ -227,6 +227,6 @@ int main(int argc, char **argv) bpf_link__destroy(link); uei_print(&skel->bss->uei); - scx_example_flatcg__destroy(skel); + scx_flatcg__destroy(skel); return 0; } diff --git a/tools/sched_ext/scx_example_flatcg.h b/tools/sched_ext/scx_flatcg.h similarity index 100% rename from tools/sched_ext/scx_example_flatcg.h rename to tools/sched_ext/scx_flatcg.h diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c similarity index 99% rename from tools/sched_ext/scx_example_pair.bpf.c rename to tools/sched_ext/scx_pair.bpf.c index 078bdd94c9877..cda126980ed51 100644 --- a/tools/sched_ext/scx_example_pair.bpf.c +++ b/tools/sched_ext/scx_pair.bpf.c @@ -116,7 +116,7 @@ * Copyright (c) 2022 David Vernet */ #include "scx_common.bpf.h" -#include "scx_example_pair.h" +#include "scx_pair.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_example_pair.c 
b/tools/sched_ext/scx_pair.c similarity index 94% rename from tools/sched_ext/scx_example_pair.c rename to tools/sched_ext/scx_pair.c index 18e032bbc173b..b35e4f511de6d 100644 --- a/tools/sched_ext/scx_example_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -12,8 +12,8 @@ #include #include #include "user_exit_info.h" -#include "scx_example_pair.h" -#include "scx_example_pair.skel.h" +#include "scx_pair.h" +#include "scx_pair.skel.h" const char help_fmt[] = "A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" @@ -36,7 +36,7 @@ static void sigint_handler(int dummy) int main(int argc, char **argv) { - struct scx_example_pair *skel; + struct scx_pair *skel; struct bpf_link *link; u64 seq = 0; s32 stride, i, opt, outer_fd; @@ -46,7 +46,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - skel = scx_example_pair__open(); + skel = scx_pair__open(); assert(skel); skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); @@ -79,7 +79,7 @@ int main(int argc, char **argv) } } - assert(!scx_example_pair__load(skel)); + assert(!scx_pair__load(skel)); /* * Populate the cgrp_q_arr map which is an array containing per-cgroup @@ -138,6 +138,6 @@ int main(int argc, char **argv) bpf_link__destroy(link); uei_print(&skel->bss->uei); - scx_example_pair__destroy(skel); + scx_pair__destroy(skel); return 0; } diff --git a/tools/sched_ext/scx_example_pair.h b/tools/sched_ext/scx_pair.h similarity index 100% rename from tools/sched_ext/scx_example_pair.h rename to tools/sched_ext/scx_pair.h diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c similarity index 100% rename from tools/sched_ext/scx_example_qmap.bpf.c rename to tools/sched_ext/scx_qmap.bpf.c diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_qmap.c similarity index 94% rename from tools/sched_ext/scx_example_qmap.c rename to tools/sched_ext/scx_qmap.c index ccb4814ee61ba..0a02aa166b478 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -13,7 +13,7 @@ #include #include #include "user_exit_info.h" -#include "scx_example_qmap.skel.h" +#include "scx_qmap.skel.h" const char help_fmt[] = "A simple five-level FIFO queue sched_ext scheduler.\n" @@ -40,7 +40,7 @@ static void sigint_handler(int dummy) int main(int argc, char **argv) { - struct scx_example_qmap *skel; + struct scx_qmap *skel; struct bpf_link *link; int opt; @@ -49,7 +49,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - skel = scx_example_qmap__open(); + skel = scx_qmap__open(); assert(skel); while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) { @@ -83,7 +83,7 @@ int main(int argc, char **argv) } } - assert(!scx_example_qmap__load(skel)); + assert(!scx_qmap__load(skel)); link = bpf_map__attach_struct_ops(skel->maps.qmap_ops); assert(link); @@ -102,6 +102,6 @@ int main(int argc, char **argv) bpf_link__destroy(link); uei_print(&skel->bss->uei); - scx_example_qmap__destroy(skel); + scx_qmap__destroy(skel); return 0; } diff --git a/tools/sched_ext/scx_example_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c similarity index 100% rename from tools/sched_ext/scx_example_simple.bpf.c rename to tools/sched_ext/scx_simple.bpf.c diff --git a/tools/sched_ext/scx_example_simple.c b/tools/sched_ext/scx_simple.c similarity index 88% rename from tools/sched_ext/scx_example_simple.c rename to tools/sched_ext/scx_simple.c index 486b401f7c951..4b2f0c16a9d1e 100644 --- a/tools/sched_ext/scx_example_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -12,7 
+12,7 @@ #include #include #include "user_exit_info.h" -#include "scx_example_simple.skel.h" +#include "scx_simple.skel.h" const char help_fmt[] = "A simple sched_ext scheduler.\n" @@ -32,7 +32,7 @@ static void sigint_handler(int simple) exit_req = 1; } -static void read_stats(struct scx_example_simple *skel, u64 *stats) +static void read_stats(struct scx_simple *skel, u64 *stats) { int nr_cpus = libbpf_num_possible_cpus(); u64 cnts[2][nr_cpus]; @@ -54,7 +54,7 @@ static void read_stats(struct scx_example_simple *skel, u64 *stats) int main(int argc, char **argv) { - struct scx_example_simple *skel; + struct scx_simple *skel; struct bpf_link *link; u32 opt; @@ -63,7 +63,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - skel = scx_example_simple__open(); + skel = scx_simple__open(); assert(skel); while ((opt = getopt(argc, argv, "fph")) != -1) { @@ -80,7 +80,7 @@ int main(int argc, char **argv) } } - assert(!scx_example_simple__load(skel)); + assert(!scx_simple__load(skel)); link = bpf_map__attach_struct_ops(skel->maps.simple_ops); assert(link); @@ -96,6 +96,6 @@ int main(int argc, char **argv) bpf_link__destroy(link); uei_print(&skel->bss->uei); - scx_example_simple__destroy(skel); + scx_simple__destroy(skel); return 0; } diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c similarity index 99% rename from tools/sched_ext/scx_example_userland.bpf.c rename to tools/sched_ext/scx_userland.bpf.c index b62cce0b54e1b..9e107a874a92d 100644 --- a/tools/sched_ext/scx_example_userland.bpf.c +++ b/tools/sched_ext/scx_userland.bpf.c @@ -22,7 +22,7 @@ */ #include #include "scx_common.bpf.h" -#include "scx_example_userland_common.h" +#include "scx_userland.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_example_userland.c b/tools/sched_ext/scx_userland.c similarity index 97% rename from tools/sched_ext/scx_example_userland.c rename to tools/sched_ext/scx_userland.c index 4152b1e65fe1a..a63adae74f21f 100644 --- a/tools/sched_ext/scx_example_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -29,8 +29,8 @@ #include #include "user_exit_info.h" -#include "scx_example_userland_common.h" -#include "scx_example_userland.skel.h" +#include "scx_userland.h" +#include "scx_userland.skel.h" const char help_fmt[] = "A minimal userland sched_ext scheduler.\n" @@ -52,7 +52,7 @@ static __u32 batch_size = 8; static volatile int exit_req; static int enqueued_fd, dispatched_fd; -static struct scx_example_userland *skel; +static struct scx_userland *skel; static struct bpf_link *ops_link; /* Stats collected in user space. 
*/ @@ -316,7 +316,7 @@ static int bootstrap(int argc, char **argv) return err; } - skel = scx_example_userland__open(); + skel = scx_userland__open(); if (!skel) { fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno)); return errno; @@ -327,7 +327,7 @@ static int bootstrap(int argc, char **argv) assert(skel->rodata->usersched_pid > 0); skel->rodata->switch_partial = switch_partial; - err = scx_example_userland__load(skel); + err = scx_userland__load(skel); if (err) { fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err)); goto destroy_skel; @@ -354,7 +354,7 @@ static int bootstrap(int argc, char **argv) return 0; destroy_skel: - scx_example_userland__destroy(skel); + scx_userland__destroy(skel); exit_req = 1; return err; } @@ -397,6 +397,6 @@ int main(int argc, char **argv) exit_req = 1; bpf_link__destroy(ops_link); uei_print(&skel->bss->uei); - scx_example_userland__destroy(skel); + scx_userland__destroy(skel); return 0; } diff --git a/tools/sched_ext/scx_example_userland_common.h b/tools/sched_ext/scx_userland.h similarity index 100% rename from tools/sched_ext/scx_example_userland_common.h rename to tools/sched_ext/scx_userland.h From 6bd10f3beddbe1f1377e71ebdde856000a4bfb13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jun 2023 15:40:07 -1000 Subject: [PATCH 077/304] SCX: A couple cosmetic updates --- kernel/sched/ext.c | 5 ++--- kernel/sched/ext.h | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6b762bf2fa51c..7da59604b0024 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3723,9 +3723,8 @@ __diag_ignore_all("-Wmissing-prototypes", /** * scx_bpf_switch_all - Switch all tasks into SCX * - * Switch all existing and future non-dl/rt tasks to SCX. - * This can only be called from ops.init(), and actual switching - * is performed asynchronously. + * Switch all existing and future non-dl/rt tasks to SCX. This can only be + * called from ops.init(), and actual switching is performed asynchronously. */ void scx_bpf_switch_all(void) { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 7e2900b8f6f21..5001bcfec570c 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -81,9 +81,8 @@ enum scx_deq_flags { SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, }; -enum scx_tg_flags { - SCX_TG_ONLINE = 1U << 0, - SCX_TG_INITED = 1U << 1, +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ }; enum scx_kick_flags { @@ -91,8 +90,9 @@ enum scx_kick_flags { SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ }; -enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, }; #ifdef CONFIG_SCHED_CLASS_EXT From 3bce5ab15ff327a0b8ef7a7cb83cfe969d85c421 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jun 2023 16:50:27 -1000 Subject: [PATCH 078/304] sched, SCX: Drop SCHED_CHANGE_BLOCK Upstream is adopting a generic guard block mechanism. Let's drop SCHED_CHANGE_BLOCK from SCX patchset so that it's easier to adapt to the new mechanism later. No functional change intended. 
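For readers following the conversion, every SCHED_CHANGE_BLOCK() site below is open-coded into the same dequeue/put_prev then enqueue/set_next bracket. A minimal sketch of that shape (not a literal excerpt of any one call site; queue_flags stands in for whichever DEQUEUE_*/ENQUEUE_* flags the site uses):

    bool queued = task_on_rq_queued(p);
    bool running = task_current(rq, p);

    if (queued)
        dequeue_task(rq, p, queue_flags);
    if (running)
        put_prev_task(rq, p);

    /* ... update the task's scheduling attributes here ... */

    if (queued)
        enqueue_task(rq, p, queue_flags);   /* call sites may add ENQUEUE_NOCLOCK/RESTORE */
    if (running)
        set_next_task(rq, p);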
--- kernel/sched/core.c | 264 +++++++++++++++++++++++++++---------------- kernel/sched/ext.c | 41 ++++--- kernel/sched/ext.h | 11 ++ kernel/sched/sched.h | 41 ------- 4 files changed, 202 insertions(+), 155 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3ff57b0b2bc84..98236b1bec498 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2132,40 +2132,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -struct sched_change_guard -sched_change_guard_init(struct rq *rq, struct task_struct *p, int flags) -{ - struct sched_change_guard cg = { - .rq = rq, - .p = p, - .queued = task_on_rq_queued(p), - .running = task_current(rq, p), - }; - - if (cg.queued) { - /* - * __kthread_bind() may call this on blocked tasks without - * holding rq->lock through __do_set_cpus_allowed(). Assert @rq - * locked iff @p is queued. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, flags); - } - if (cg.running) - put_prev_task(rq, p); - - return cg; -} - -void sched_change_guard_fini(struct sched_change_guard *cg, int flags) -{ - if (cg->queued) - enqueue_task(cg->rq, cg->p, flags | ENQUEUE_NOCLOCK); - if (cg->running) - set_next_task(cg->rq, cg->p); - cg->done = true; -} - static inline int __normal_prio(int policy, int rt_prio, int nice) { int prio; @@ -2635,6 +2601,7 @@ static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { struct rq *rq = task_rq(p); + bool queued, running; /* * This here violates the locking rules for affinity, since we're only @@ -2653,9 +2620,26 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) else lockdep_assert_held(&p->pi_lock); - SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { - p->sched_class->set_cpus_allowed(p, ctx); + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_rq_held(rq); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } + if (running) + put_prev_task(rq, p); + + p->sched_class->set_cpus_allowed(p, ctx); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); } /* @@ -7132,7 +7116,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queue_flag = + int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; @@ -7192,41 +7176,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; - SCHED_CHANGE_BLOCK(rq, p, queue_flag) { - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. 
-dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; - } else { - p->dl.pi_se = &p->dl; - } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; + queued = task_on_rq_queued(p); + running = task_current(rq, p); + if (queued) + dequeue_task(rq, p, queue_flag); + if (running) + put_prev_task(rq, p); + + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + queue_flag |= ENQUEUE_REPLENISH; } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; + p->dl.pi_se = &p->dl; } - - __setscheduler_prio(p, prio); - check_class_changing(rq, p, prev_class); + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + queue_flag |= ENQUEUE_HEAD; + } else { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } + __setscheduler_prio(p, prio); + check_class_changing(rq, p, prev_class); + + if (queued) + enqueue_task(rq, p, queue_flag); + if (running) + set_next_task(rq, p); + check_class_changed(rq, p, prev_class, oldprio); out_unlock: /* Avoid rq from going away on us: */ @@ -7247,6 +7241,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { + bool queued, running; int old_prio; struct rq_flags rf; struct rq *rq; @@ -7270,13 +7265,22 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } + queued = task_on_rq_queued(p); + running = task_current(rq, p); + if (queued) + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + if (running) + put_prev_task(rq, p); - SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - } + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); /* * If the task increased its priority or is running and @@ -7660,7 +7664,7 @@ static int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio; + int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; struct balance_callback *head; struct rq_flags rf; @@ -7829,24 +7833,34 @@ static int __sched_setscheduler(struct task_struct *p, queue_flags &= ~DEQUEUE_MOVE; } - SCHED_CHANGE_BLOCK(rq, p, queue_flags) { - prev_class = p->sched_class; + queued = task_on_rq_queued(p); + running = task_current(rq, p); + if (queued) + dequeue_task(rq, p, queue_flags); + if (running) + put_prev_task(rq, p); - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - 
__setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); + prev_class = p->sched_class; - check_class_changing(rq, p, prev_class); + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } + __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); + if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). */ if (oldprio < p->prio) queue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, queue_flags); } + if (running) + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -9419,15 +9433,25 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { + bool queued, running; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); + queued = task_on_rq_queued(p); + running = task_current(rq, p); - SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE) { - p->numa_preferred_nid = nid; - } + if (queued) + dequeue_task(rq, p, DEQUEUE_SAVE); + if (running) + put_prev_task(rq, p); + p->numa_preferred_nid = nid; + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -10534,6 +10558,8 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group */ void sched_move_task(struct task_struct *tsk) { + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct task_group *group; struct rq_flags rf; struct rq *rq; @@ -10549,19 +10575,28 @@ void sched_move_task(struct task_struct *tsk) update_rq_clock(rq); - SCHED_CHANGE_BLOCK(rq, tsk, - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) { - sched_change_group(tsk, group); - scx_move_task(tsk); - } + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); - /* - * After changing group, the running task may have joined a throttled - * one but it's still the running task. Trigger a resched to make sure - * that task can still run. - */ - if (task_current(rq, tsk)) + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + sched_change_group(tsk, group); + scx_move_task(tsk); + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) { + set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. 
+ */ resched_curr(rq); + } unlock: task_rq_unlock(rq, tsk, &rf); @@ -12121,3 +12156,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif + +#ifdef CONFIG_SCHED_CLASS_EXT +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + + *ctx = (struct sched_enq_and_set_ctx){ + .p = p, + .queue_flags = queue_flags | DEQUEUE_NOCLOCK, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + update_rq_clock(rq); + if (ctx->queued) + dequeue_task(rq, p, queue_flags); + if (ctx->running) + put_prev_task(rq, p); +} + +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(ctx->p); + + lockdep_assert_rq_held(rq); + + if (ctx->queued) + enqueue_task(rq, ctx->p, ctx->queue_flags); + if (ctx->running) + set_next_task(rq, ctx->p); +} +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7da59604b0024..7906ba5e1564b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2984,10 +2984,11 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_init(&sti); while ((p = scx_task_iter_next_filtered_locked(&sti))) { if (READ_ONCE(p->__state) != TASK_DEAD) { - SCHED_CHANGE_BLOCK(task_rq(p), p, - DEQUEUE_SAVE | DEQUEUE_MOVE) { - /* cycling deq/enq is enough, see above */ - } + struct sched_enq_and_set_ctx ctx; + + /* cycling deq/enq is enough, see above */ + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_enq_and_set_task(&ctx); } } scx_task_iter_exit(&sti); @@ -3018,18 +3019,21 @@ static void scx_ops_disable_workfn(struct kthread_work *work) while ((p = scx_task_iter_next_filtered_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct rq *rq = task_rq(p); + struct sched_enq_and_set_ctx ctx; bool alive = READ_ONCE(p->__state) != TASK_DEAD; update_rq_clock(rq); - SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_MOVE | - DEQUEUE_NOCLOCK) { - p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE | + DEQUEUE_NOCLOCK, &ctx); - __setscheduler_prio(p, p->prio); - if (alive) - check_class_changing(task_rq(p), p, old_class); - } + p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); + + __setscheduler_prio(p, p->prio); + if (alive) + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); if (alive) check_class_changed(task_rq(p), p, old_class, p->prio); @@ -3333,15 +3337,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops) if (READ_ONCE(p->__state) != TASK_DEAD) { const struct sched_class *old_class = p->sched_class; struct rq *rq = task_rq(p); + struct sched_enq_and_set_ctx ctx; update_rq_clock(rq); - SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_MOVE | - DEQUEUE_NOCLOCK) { - scx_ops_enable_task(p); - __setscheduler_prio(p, p->prio); - check_class_changing(task_rq(p), p, old_class); - } + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE | + DEQUEUE_NOCLOCK, &ctx); + + scx_ops_enable_task(p); + __setscheduler_prio(p, p->prio); + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); } else { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 5001bcfec570c..405037a4e6ce7 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -97,6 +97,17 @@ enum scx_tg_flags { #ifdef CONFIG_SCHED_CLASS_EXT +struct sched_enq_and_set_ctx { 
+ struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + extern const struct sched_class ext_sched_class; extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; extern const struct file_operations sched_ext_fops; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d2876a981c52..e7b15bd7adbc2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2527,47 +2527,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -struct sched_change_guard { - struct task_struct *p; - struct rq *rq; - bool queued; - bool running; - bool done; -}; - -extern struct sched_change_guard -sched_change_guard_init(struct rq *rq, struct task_struct *p, int flags); - -extern void sched_change_guard_fini(struct sched_change_guard *cg, int flags); - -/** - * SCHED_CHANGE_BLOCK - Nested block for task attribute updates - * @__rq: Runqueue the target task belongs to - * @__p: Target task - * @__flags: DEQUEUE/ENQUEUE_* flags - * - * A task may need to be dequeued and put_prev_task'd for attribute updates and - * set_next_task'd and re-enqueued afterwards. This helper defines a nested - * block which automatically handles these preparation and cleanup operations. - * - * SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { - * update_attribute(p); - * ... - * } - * - * If @__flags is a variable, the variable may be updated in the block body and - * the updated value will be used when re-enqueueing @p. - * - * If %DEQUEUE_NOCLOCK is specified, the caller is responsible for calling - * update_rq_clock() beforehand. Otherwise, the rq clock is automatically - * updated iff the task needs to be dequeued and re-enqueued. Only the former - * case guarantees that the rq clock is up-to-date inside and after the block. - */ -#define SCHED_CHANGE_BLOCK(__rq, __p, __flags) \ - for (struct sched_change_guard __cg = \ - sched_change_guard_init(__rq, __p, __flags); \ - !__cg.done; sched_change_guard_fini(&__cg, __flags)) - extern void check_class_changing(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class); extern void check_class_changed(struct rq *rq, struct task_struct *p, From 97eb64719aef365adb91ed79b7d657cfef1ed73b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 23 Jun 2023 10:48:24 -1000 Subject: [PATCH 079/304] SCX: Drop implicit DEQUEUE_NOCLOCK from sched_deq_and_put_task() This snuck in while forward porting. The current users of deq_and_put manage DEQUEUE_NOCLOCK themselves and it shouldn't be set implicitly. 
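With the implicit flag gone, a caller that has already updated the rq clock must pass the flag itself, mirroring the existing ext.c call sites (illustrative sketch, not a new call site):

    struct sched_enq_and_set_ctx ctx;

    update_rq_clock(rq);
    sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK, &ctx);
    /* ... modify p ... */
    sched_enq_and_set_task(&ctx);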
--- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98236b1bec498..4b8245b223ab0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -12167,7 +12167,7 @@ void sched_deq_and_put_task(struct task_struct *p, int queue_flags, *ctx = (struct sched_enq_and_set_ctx){ .p = p, - .queue_flags = queue_flags | DEQUEUE_NOCLOCK, + .queue_flags = queue_flags, .queued = task_on_rq_queued(p), .running = task_current(rq, p), }; From 8460dc07829a936ef37eeea2f15f65237b34bdd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 23 Jun 2023 11:53:04 -1000 Subject: [PATCH 080/304] SCX: Build fixes when !CONFIG_SCHED_SMT or !CONFIG_CGROUP_SCHED --- kernel/sched/ext.c | 54 ++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7906ba5e1564b..6d524ffe37d25 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1599,6 +1599,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, ret = balance_one(rq, prev, rf, true); +#ifdef CONFIG_SCHED_SMT /* * When core-sched is enabled, this ops.balance() call will be followed * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() @@ -1629,7 +1630,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, rq_repin_lock(rq, rf); } } - +#endif return ret; } @@ -1925,13 +1926,14 @@ void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, static bool test_and_clear_cpu_idle(int cpu) { +#ifdef CONFIG_SCHED_SMT /* * SMT mask should be cleared whether we can claim @cpu or not. The SMT * cluster is not wholly idle either way. This also prevents * scx_pick_idle_cpu() from getting caught in an infinite loop. */ if (sched_smt_active()) { - const struct cpumask *sbm = topology_sibling_cpumask(cpu); + const struct cpumask *smt = cpu_smt_mask(cpu); /* * If offline, @cpu is not its own sibling and @@ -1939,12 +1941,12 @@ static bool test_and_clear_cpu_idle(int cpu) * @cpu is never cleared from idle_masks.smt. Ensure that @cpu * is eventually cleared. */ - if (cpumask_intersects(sbm, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, sbm); + if (cpumask_intersects(smt, idle_masks.smt)) + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); else if (cpumask_test_cpu(cpu, idle_masks.smt)) __cpumask_clear_cpu(cpu, idle_masks.smt); } - +#endif return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); } @@ -2076,7 +2078,6 @@ static void reset_idle_masks(void) void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); - struct cpumask *sib_mask = topology_sibling_cpumask(cpu); if (SCX_HAS_OP(update_idle)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); @@ -2084,22 +2085,30 @@ void __scx_update_idle(struct rq *rq, bool idle) return; } - if (idle) { + if (idle) cpumask_set_cpu(cpu, idle_masks.cpu); + else + cpumask_clear_cpu(cpu, idle_masks.cpu); - /* - * idle_masks.smt handling is racy but that's fine as it's only - * for optimization and self-correcting. - */ - for_each_cpu(cpu, sib_mask) { - if (!cpumask_test_cpu(cpu, idle_masks.cpu)) - return; +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + if (idle) { + /* + * idle_masks.smt handling is racy but that's fine as + * it's only for optimization and self-correcting. 
+ */ + for_each_cpu(cpu, smt) { + if (!cpumask_test_cpu(cpu, idle_masks.cpu)) + return; + } + cpumask_or(idle_masks.smt, idle_masks.smt, smt); + } else { + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); } - cpumask_or(idle_masks.smt, idle_masks.smt, sib_mask); - } else { - cpumask_clear_cpu(cpu, idle_masks.cpu); - cpumask_andnot(idle_masks.smt, idle_masks.smt, sib_mask); } +#endif } static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) @@ -4212,7 +4221,10 @@ const struct cpumask *scx_bpf_get_idle_smtmask(void) { } #ifdef CONFIG_SMP - return idle_masks.smt; + if (sched_smt_active()) + return idle_masks.smt; + else + return idle_masks.cpu; #else return cpu_none_mask; #endif @@ -4336,6 +4348,7 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) * rq-locked operations. Can be called on the parameter tasks of rq-locked * operations. The restriction guarantees that @p's rq is locked by the caller. */ +#ifdef CONFIG_CGROUP_SCHED struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) { struct task_group *tg = p->sched_task_group; @@ -4357,6 +4370,7 @@ struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) cgroup_get(cgrp); return cgrp; } +#endif BTF_SET8_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_kick_cpu) @@ -4371,7 +4385,9 @@ BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +#ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif BTF_SET8_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { From d0a8cea8900ecca6fb2e259ca2b02393e3681807 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Jun 2023 14:10:58 -1000 Subject: [PATCH 081/304] SCX: Always imply NOCLOCK in sched_deq_and_put_task/sched_enq_and_set_task() SCHED_CHANGE_BLOCK() was used in other places where avoiding clock updates may be a useful optimization. However, the sched_deq_and_put_task() and sched_enq_and_set_task() pair is only used by the SCX switching paths, which always want the clock updated. The recent conversion away from SCHED_CHANGE_BLOCK() left out ENQUEUE_NOCLOCK when re-enqueueing, which could lead to double rq clock update warnings. Let's restore the original behavior where sched_deq_and_put_task() always calls update_rq_clock() and the deq and enq calls implicitly set the NOCLOCK flags. 
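With this change the SCX switching paths no longer deal with the clock or the NOCLOCK flags at all; the calling convention reduces to (sketch):

    struct sched_enq_and_set_ctx ctx;

    /* update_rq_clock() is called inside sched_deq_and_put_task() */
    sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
    /* ... switch p's scheduling class / priority ... */
    sched_enq_and_set_task(&ctx);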
--- kernel/sched/core.c | 4 ++-- kernel/sched/ext.c | 13 +++---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b8245b223ab0..9128160b6264b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -12174,7 +12174,7 @@ void sched_deq_and_put_task(struct task_struct *p, int queue_flags, update_rq_clock(rq); if (ctx->queued) - dequeue_task(rq, p, queue_flags); + dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); if (ctx->running) put_prev_task(rq, p); } @@ -12186,7 +12186,7 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) lockdep_assert_rq_held(rq); if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags); + enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); if (ctx->running) set_next_task(rq, ctx->p); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6d524ffe37d25..0db6d400d340e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3027,14 +3027,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_init(&sti); while ((p = scx_task_iter_next_filtered_locked(&sti))) { const struct sched_class *old_class = p->sched_class; - struct rq *rq = task_rq(p); struct sched_enq_and_set_ctx ctx; bool alive = READ_ONCE(p->__state) != TASK_DEAD; - update_rq_clock(rq); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE | - DEQUEUE_NOCLOCK, &ctx); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); @@ -3345,13 +3341,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops) while ((p = scx_task_iter_next_filtered_locked(&sti))) { if (READ_ONCE(p->__state) != TASK_DEAD) { const struct sched_class *old_class = p->sched_class; - struct rq *rq = task_rq(p); struct sched_enq_and_set_ctx ctx; - update_rq_clock(rq); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE | - DEQUEUE_NOCLOCK, &ctx); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, + &ctx); scx_ops_enable_task(p); __setscheduler_prio(p, p->prio); From 722926821124de1893dc869a6728fe40d619f95c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jun 2023 13:39:32 -1000 Subject: [PATCH 082/304] scx_pair: Fix custom stride error handling scx_pair uses the default stride value of nr_cpu_ids / 2, which matches most x86 SMT configurations. However, it does allow specifying a custom stride value with -S so that e.g. neighboring CPUs can be paired up. However, not all stride values work and errors were not reported very well. This patch improves error handling so that scx_pair fails with clear error message if CPUs can't be paired up with the specified stride value. scx_pair now also prints out how CPUs are paired on startup. This should address issues #28 and #29. 
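As a concrete illustration of the pairing rule the loop below implements, here is a small standalone sketch (hypothetical values: 8 possible CPUs, stride 4), separate from the patch itself. It prints [0, 4] [1, 5] [2, 6] [3, 7] and rejects strides that would pair a CPU with itself or make three CPUs claim the same pair:

#include <stdio.h>

int main(void)
{
    int nr_cpu_ids = 8, stride = 4;  /* hypothetical example values */
    int pair_cpu[8];

    for (int i = 0; i < nr_cpu_ids; i++)
        pair_cpu[i] = -1;

    for (int i = 0; i < nr_cpu_ids; i++) {
        int j = (i + stride) % nr_cpu_ids;

        if (pair_cpu[i] >= 0)
            continue;
        if (i == j) {
            fprintf(stderr, "Invalid stride %d - CPU%d wants to be its own pair\n",
                    stride, i);
            return 1;
        }
        if (pair_cpu[j] >= 0) {
            fprintf(stderr, "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair\n",
                    stride, i, j, pair_cpu[j]);
            return 1;
        }
        pair_cpu[i] = j;
        pair_cpu[j] = i;
        printf("[%d, %d] ", i, j);
    }
    printf("\n");
    return 0;
}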
--- tools/sched_ext/scx_pair.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index b35e4f511de6d..4d24fcedc2cd0 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -68,16 +68,37 @@ int main(int argc, char **argv) } } + printf("Pairs: "); for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { - if (skel->rodata->pair_cpu[i] < 0) { - skel->rodata->pair_cpu[i] = i + stride; - skel->rodata->pair_cpu[i + stride] = i; - skel->rodata->pair_id[i] = i; - skel->rodata->pair_id[i + stride] = i; - skel->rodata->in_pair_idx[i] = 0; - skel->rodata->in_pair_idx[i + stride] = 1; + int j = (i + stride) % skel->rodata->nr_cpu_ids; + + if (skel->rodata->pair_cpu[i] >= 0) + continue; + + if (i == j) { + printf("\n"); + fprintf(stderr, "Invalid stride %d - CPU%d wants to be its own pair\n", + stride, i); + return 1; + } + + if (skel->rodata->pair_cpu[j] >= 0) { + printf("\n"); + fprintf(stderr, "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair\n", + stride, i, j, skel->rodata->pair_cpu[j]); + return 1; } + + skel->rodata->pair_cpu[i] = j; + skel->rodata->pair_cpu[j] = i; + skel->rodata->pair_id[i] = i; + skel->rodata->pair_id[j] = i; + skel->rodata->in_pair_idx[i] = 0; + skel->rodata->in_pair_idx[j] = 1; + + printf("[%d, %d] ", i, j); } + printf("\n"); assert(!scx_pair__load(skel)); From e348fae2865c07d09f73dd0eb2ce40e8d2870717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 11 Jul 2023 12:34:29 -0700 Subject: [PATCH 083/304] Update various dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update libbpf-rs and libbpf-cargo to 0.21, which is the most recent version. Also update libbpf-sys to 1.2.0, which is a requirement for .struct_ops.link support that the program uses (and which got added with libbpf 1.2). 
Signed-off-by: Daniel Müller --- tools/sched_ext/scx_atropos/Cargo.toml | 6 +++--- tools/sched_ext/scx_atropos/src/main.rs | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/sched_ext/scx_atropos/Cargo.toml b/tools/sched_ext/scx_atropos/Cargo.toml index a5ab02cb55f86..f9890092b8a9c 100644 --- a/tools/sched_ext/scx_atropos/Cargo.toml +++ b/tools/sched_ext/scx_atropos/Cargo.toml @@ -12,8 +12,8 @@ bitvec = { version = "1.0", features = ["serde"] } clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } hex = "0.4.3" -libbpf-rs = "0.19.1" -libbpf-sys = { version = "1.0.4", features = ["novendor", "static"] } +libbpf-rs = "0.21.0" +libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" @@ -21,7 +21,7 @@ simplelog = "0.12.0" [build-dependencies] bindgen = { version = "0.61.0" } -libbpf-cargo = "0.13.0" +libbpf-cargo = "0.21.0" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_atropos/src/main.rs b/tools/sched_ext/scx_atropos/src/main.rs index 6d8ea6f4ef3c0..3d1fc845c5be8 100644 --- a/tools/sched_ext/scx_atropos/src/main.rs +++ b/tools/sched_ext/scx_atropos/src/main.rs @@ -25,6 +25,9 @@ use anyhow::Context; use anyhow::Result; use bitvec::prelude::*; use clap::Parser; +use libbpf_rs::skel::OpenSkel as _; +use libbpf_rs::skel::Skel as _; +use libbpf_rs::skel::SkelBuilder as _; use log::info; use log::trace; use log::warn; @@ -140,12 +143,9 @@ fn now_monotonic() -> u64 { time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 } -fn clear_map(map: &mut libbpf_rs::Map) { - // XXX: libbpf_rs has some design flaw that make it impossible to - // delete while iterating despite it being safe so we alias it here - let deleter: &mut libbpf_rs::Map = unsafe { &mut *(map as *mut _) }; +fn clear_map(map: &libbpf_rs::Map) { for key in map.keys() { - let _ = deleter.delete(&key); + let _ = map.delete(&key); } } From 56b278fa8b5136457993f7389e34070d35f17e8a Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 12 Jul 2023 13:59:50 -0500 Subject: [PATCH 084/304] scx: Make task_on_scx() take const task_struct task_on_scx() takes a struct task_struct *p, but it's called by __task_prio() in CONFIG_SCHED_CORE which takes a const struct task_struct *. This can cause the build to fail due to incompatible pointer types: kernel/sched/core.c:170:18: error: passing 'const struct task_struct *' to parameter of type 'struct task_struct *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] if (task_on_scx(p)) ^ kernel/sched/ext.h:124:52: note: passing argument to parameter 'p' here static inline bool task_on_scx(struct task_struct *p) Fix task_on_scx() so this doesn't happen. 
Signed-off-by: David Vernet --- kernel/sched/ext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 405037a4e6ce7..e9c699a87770f 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -121,7 +121,7 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -static inline bool task_on_scx(struct task_struct *p) +static inline bool task_on_scx(const struct task_struct *p) { return scx_enabled() && p->sched_class == &ext_sched_class; } @@ -214,7 +214,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #define scx_enabled() false #define scx_switched_all() false -static inline bool task_on_scx(struct task_struct *p) { return false; } +static inline bool task_on_scx(const struct task_struct *p) { return false; } static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} From b4286277ac910774cde4c2facaf9f303d8dd09ea Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Fri, 14 Jul 2023 12:03:26 -0700 Subject: [PATCH 085/304] scx: atropos: Use fb-procfs crate fb-procfs crate supports hotplug CPUs now, so migrate back to that for measuring CPU util. Signed-off-by: Dan Schatzberg --- tools/sched_ext/scx_atropos/Cargo.toml | 1 + tools/sched_ext/scx_atropos/src/main.rs | 199 +++++++++++++++--------- 2 files changed, 123 insertions(+), 77 deletions(-) diff --git a/tools/sched_ext/scx_atropos/Cargo.toml b/tools/sched_ext/scx_atropos/Cargo.toml index f9890092b8a9c..40a782282b371 100644 --- a/tools/sched_ext/scx_atropos/Cargo.toml +++ b/tools/sched_ext/scx_atropos/Cargo.toml @@ -11,6 +11,7 @@ anyhow = "1.0.65" bitvec = { version = "1.0", features = ["serde"] } clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } +fb_procfs = "0.7.0" hex = "0.4.3" libbpf-rs = "0.21.0" libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } diff --git a/tools/sched_ext/scx_atropos/src/main.rs b/tools/sched_ext/scx_atropos/src/main.rs index 3d1fc845c5be8..aebbcd7b0bcee 100644 --- a/tools/sched_ext/scx_atropos/src/main.rs +++ b/tools/sched_ext/scx_atropos/src/main.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use ::fb_procfs as procfs; use anyhow::anyhow; use anyhow::bail; use anyhow::Context; @@ -157,77 +158,61 @@ fn format_cpumask(cpumask: &[u64], nr_cpus: usize) -> String { .fold(String::new(), |acc, x| format!("{} {:016X}", acc, x)) } -// Neither procfs or fb_procfs can determine per-CPU utilization reliably -// with CPU hot[un]plugs. Roll our own. -// -// https://github.com/eminence/procfs/issues/274 -// https://github.com/facebookincubator/below/issues/8190 -#[derive(Clone, Debug, Default)] -struct MyCpuStat { - user: u64, - nice: u64, - system: u64, - idle: u64, - iowait: u64, - irq: u64, - softirq: u64, - steal: u64, +fn read_total_cpu(reader: &procfs::ProcReader) -> Result { + Ok(reader + .read_stat() + .context("Failed to read procfs")? + .total_cpu + .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))?) 
} -impl MyCpuStat { - fn busy_and_total(&self) -> (u64, u64) { - let busy = self.user + self.system + self.nice + self.irq + self.softirq + self.steal; - (busy, self.idle + busy + self.iowait) - } - - fn calc_util(&self, prev: &MyCpuStat) -> f64 { - let (curr_busy, curr_total) = self.busy_and_total(); - let (prev_busy, prev_total) = prev.busy_and_total(); - let busy = curr_busy - prev_busy; - let total = curr_total - prev_total; - if total > 0 { - ((busy as f64) / (total as f64)).clamp(0.0, 1.0) - } else { - 1.0 - } - } -} - -#[derive(Clone, Debug, Default)] -struct MyProcStat { - total: MyCpuStat, - cpus: BTreeMap, -} - -impl MyProcStat { - fn read() -> Result { - let mut result: MyProcStat = Default::default(); - for line in std::fs::read_to_string("/proc/stat")?.lines() { - let mut toks = line.split_whitespace(); - - let key = toks.next().ok_or(anyhow!("no key"))?; - if !key.starts_with("cpu") { - break; - } - - let cputime = MyCpuStat { - user: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - nice: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - system: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - idle: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - iowait: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - irq: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - softirq: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - steal: toks.next().ok_or(anyhow!("missing"))?.parse::()?, - }; - - if key.len() == 3 { - result.total = cputime; +fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { + match (curr, prev) { + ( + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + .. + }, + procfs::CpuStat { + user_usec: Some(curr_user), + nice_usec: Some(curr_nice), + system_usec: Some(curr_system), + idle_usec: Some(curr_idle), + iowait_usec: Some(curr_iowait), + irq_usec: Some(curr_irq), + softirq_usec: Some(curr_softirq), + stolen_usec: Some(curr_stolen), + .. + }, + ) => { + let idle_usec = curr_idle - prev_idle; + let iowait_usec = curr_iowait - prev_iowait; + let user_usec = curr_user - prev_user; + let system_usec = curr_system - prev_system; + let nice_usec = curr_nice - prev_nice; + let irq_usec = curr_irq - prev_irq; + let softirq_usec = curr_softirq - prev_softirq; + let stolen_usec = curr_stolen - prev_stolen; + + let busy_usec = + user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; + let total_usec = idle_usec + busy_usec + iowait_usec; + if total_usec > 0 { + return Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)); } else { - result.cpus.insert(key[3..].parse::()?, cputime); + return Ok(1.0); } } - Ok(result) + _ => { + bail!("Missing stats in cpustat"); + } } } @@ -394,38 +379,50 @@ struct Tuner { top: Arc, direct_greedy_under: f64, kick_greedy_under: f64, - prev_cpu_stats: BTreeMap, + proc_reader: procfs::ProcReader, + prev_cpu_stats: BTreeMap, dom_utils: Vec, } impl Tuner { fn new(top: Arc, opts: &Opts) -> Result { + let proc_reader = procfs::ProcReader::new(); + let prev_cpu_stats = proc_reader + .read_stat()? 
+ .cpus_map + .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; Ok(Self { direct_greedy_under: opts.direct_greedy_under / 100.0, kick_greedy_under: opts.kick_greedy_under / 100.0, - prev_cpu_stats: MyProcStat::read()?.cpus, + proc_reader, + prev_cpu_stats, dom_utils: vec![0.0; top.nr_doms], top, }) } fn step(&mut self, skel: &mut AtroposSkel) -> Result<()> { - let curr_cpu_stats = MyProcStat::read()?.cpus; + let curr_cpu_stats = self + .proc_reader + .read_stat()? + .cpus_map + .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; let ti = &mut skel.bss().tune_input; let mut dom_nr_cpus = vec![0; self.top.nr_doms]; let mut dom_util_sum = vec![0.0; self.top.nr_doms]; for cpu in 0..self.top.nr_cpus { + let cpu32 = cpu as u32; // None domain indicates the CPU was offline during - // initialization and None MyCpuStat indicates the CPU has gone + // initialization and None CpuStat indicates the CPU has gone // down since then. Ignore both. if let (Some(dom), Some(curr), Some(prev)) = ( self.top.cpu_dom[cpu], - curr_cpu_stats.get(&cpu), - self.prev_cpu_stats.get(&cpu), + curr_cpu_stats.get(&cpu32), + self.prev_cpu_stats.get(&cpu32), ) { dom_nr_cpus[dom] += 1; - dom_util_sum[dom] += curr.calc_util(prev); + dom_util_sum[dom] += calc_util(curr, prev)?; } } @@ -843,9 +840,10 @@ struct Scheduler<'a> { balanced_kworkers: bool, top: Arc, + proc_reader: procfs::ProcReader, prev_at: Instant, - prev_total_cpu: MyCpuStat, + prev_total_cpu: procfs::CpuStat, task_loads: BTreeMap, nr_lb_data_errors: u64, @@ -914,7 +912,8 @@ impl<'a> Scheduler<'a> { info!("Atropos Scheduler Attached"); // Other stuff. - let prev_total_cpu = MyProcStat::read()?.total; + let proc_reader = procfs::ProcReader::new(); + let prev_total_cpu = read_total_cpu(&proc_reader)?; Ok(Self { skel, @@ -927,6 +926,7 @@ impl<'a> Scheduler<'a> { balanced_kworkers: opts.balanced_kworkers, top: top.clone(), + proc_reader, prev_at: Instant::now(), prev_total_cpu, @@ -939,8 +939,53 @@ impl<'a> Scheduler<'a> { } fn get_cpu_busy(&mut self) -> Result { - let total_cpu = MyProcStat::read()?.total; - let busy = total_cpu.calc_util(&self.prev_total_cpu); + let total_cpu = read_total_cpu(&mut self.proc_reader)?; + let busy = match (&self.prev_total_cpu, &total_cpu) { + ( + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + guest_usec: _, + guest_nice_usec: _, + }, + procfs::CpuStat { + user_usec: Some(curr_user), + nice_usec: Some(curr_nice), + system_usec: Some(curr_system), + idle_usec: Some(curr_idle), + iowait_usec: Some(curr_iowait), + irq_usec: Some(curr_irq), + softirq_usec: Some(curr_softirq), + stolen_usec: Some(curr_stolen), + guest_usec: _, + guest_nice_usec: _, + }, + ) => { + let idle_usec = curr_idle - prev_idle; + let iowait_usec = curr_iowait - prev_iowait; + let user_usec = curr_user - prev_user; + let system_usec = curr_system - prev_system; + let nice_usec = curr_nice - prev_nice; + let irq_usec = curr_irq - prev_irq; + let softirq_usec = curr_softirq - prev_softirq; + let stolen_usec = curr_stolen - prev_stolen; + + let busy_usec = + user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; + let total_usec = idle_usec + busy_usec + iowait_usec; + busy_usec as f64 / total_usec as f64 + } + _ => { + bail!("Some procfs stats are not populated!"); + } + }; + self.prev_total_cpu = total_cpu; 
Ok(busy) } From 34e4dd378ee0c4ae5f36d41bec463bb594e81ff4 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 19 Jul 2023 14:01:31 -0500 Subject: [PATCH 086/304] scx: Convert user space schedulers to use __{s,u}{32,64} types When trying to compile the schedulers from source in different contexts, it can be a pain to use the kernel types. Let's update the schedulers to use the types exported from UAPI headers, and get rid of the _GNU_SOURCE macro. Signed-off-by: David Vernet --- tools/sched_ext/scx_central.c | 7 +++---- tools/sched_ext/scx_flatcg.c | 4 ++-- tools/sched_ext/scx_pair.c | 11 +++++------ tools/sched_ext/scx_qmap.c | 1 - tools/sched_ext/scx_simple.c | 13 ++++++------- tools/sched_ext/scx_userland.c | 3 +-- 6 files changed, 17 insertions(+), 22 deletions(-) diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 7481d3c9123a8..05eed8c8ec442 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -4,7 +4,6 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#define _GNU_SOURCE #include #include #include @@ -36,8 +35,8 @@ int main(int argc, char **argv) { struct scx_central *skel; struct bpf_link *link; - u64 seq = 0; - s32 opt; + __u64 seq = 0; + __s32 opt; signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); @@ -70,7 +69,7 @@ int main(int argc, char **argv) assert(link); while (!exit_req && !uei_exited(&skel->bss->uei)) { - printf("[SEQ %lu]\n", seq++); + printf("[SEQ %llu]\n", seq++); printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", skel->bss->nr_total, skel->bss->nr_locals, diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 40aa464c55b1a..072a04979bdd0 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -4,10 +4,10 @@ * Copyright (c) 2023 Tejun Heo * Copyright (c) 2023 David Vernet */ -#define _GNU_SOURCE #include #include #include +#include #include #include #include @@ -119,7 +119,7 @@ int main(int argc, char **argv) __u64 last_cpu_sum = 0, last_cpu_idle = 0; __u64 last_stats[FCG_NR_STATS] = {}; unsigned long seq = 0; - s32 opt; + __s32 opt; signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index 4d24fcedc2cd0..c98db94903045 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -4,7 +4,6 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#define _GNU_SOURCE #include #include #include @@ -38,8 +37,8 @@ int main(int argc, char **argv) { struct scx_pair *skel; struct bpf_link *link; - u64 seq = 0; - s32 stride, i, opt, outer_fd; + __u64 seq = 0; + __s32 stride, i, opt, outer_fd; signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); @@ -113,13 +112,13 @@ int main(int argc, char **argv) printf("Initializing"); for (i = 0; i < MAX_CGRPS; i++) { - s32 inner_fd; + __s32 inner_fd; if (exit_req) break; inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, - sizeof(u32), MAX_QUEUED, NULL); + sizeof(__u32), MAX_QUEUED, NULL); assert(inner_fd >= 0); assert(!bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY)); close(inner_fd); @@ -137,7 +136,7 @@ int main(int argc, char **argv) assert(link); while (!exit_req && !uei_exited(&skel->bss->uei)) { - printf("[SEQ %lu]\n", seq++); + printf("[SEQ %llu]\n", seq++); printf(" total:%10lu dispatch:%10lu missing:%10lu\n", skel->bss->nr_total, skel->bss->nr_dispatched, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 
0a02aa166b478..3d122d34454e8 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -4,7 +4,6 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#define _GNU_SOURCE #include #include #include diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 4b2f0c16a9d1e..7e32d4fd4aa07 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -4,7 +4,6 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#define _GNU_SOURCE #include #include #include @@ -32,11 +31,11 @@ static void sigint_handler(int simple) exit_req = 1; } -static void read_stats(struct scx_simple *skel, u64 *stats) +static void read_stats(struct scx_simple *skel, __u64 *stats) { int nr_cpus = libbpf_num_possible_cpus(); - u64 cnts[2][nr_cpus]; - u32 idx; + __u64 cnts[2][nr_cpus]; + __u32 idx; memset(stats, 0, sizeof(stats[0]) * 2); @@ -56,7 +55,7 @@ int main(int argc, char **argv) { struct scx_simple *skel; struct bpf_link *link; - u32 opt; + __u32 opt; signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); @@ -86,10 +85,10 @@ int main(int argc, char **argv) assert(link); while (!exit_req && !uei_exited(&skel->bss->uei)) { - u64 stats[2]; + __u64 stats[2]; read_stats(skel, stats); - printf("local=%lu global=%lu\n", stats[0], stats[1]); + printf("local=%llu global=%llu\n", stats[0], stats[1]); fflush(stdout); sleep(1); } diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index a63adae74f21f..2a50a9b459a5b 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -15,7 +15,6 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#define _GNU_SOURCE #include #include #include @@ -101,7 +100,7 @@ static __u32 task_pid(const struct enqueued_task *task) return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); } -static int dispatch_task(s32 pid) +static int dispatch_task(__s32 pid) { int err; From 032ed271ad880bf7cf23640fa02746dfe1a9c9f3 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 19 Jul 2023 15:17:07 -0500 Subject: [PATCH 087/304] scx: Add new SCX_PANIC* macros The assert() function in assert.h will be compiled out if NDEBUG is defined. We're currently using asserts on some actual operations, such as loading a prog. Let's instead add a macro that can be used to panic in the event of a failure, but which will not be compiled out. 
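To make the hazard concrete (hypothetical snippet, not part of the diff): when NDEBUG is defined, assert() compiles away together with the operation it wraps, so the skeleton would never even be loaded, whereas the new macro always evaluates its condition:

    /* Compiled out entirely under NDEBUG -- the load never happens. */
    assert(!scx_qmap__load(skel));

    /* Always evaluated; prints an error and exits on failure. */
    SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel");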
Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 13 ++++---- tools/sched_ext/scx_central.c | 8 ++--- tools/sched_ext/scx_flatcg.c | 18 +++-------- tools/sched_ext/scx_pair.c | 34 +++++++++---------- tools/sched_ext/scx_qmap.c | 8 ++--- tools/sched_ext/scx_simple.c | 8 ++--- tools/sched_ext/scx_user_common.h | 34 +++++++++++++++++++ tools/sched_ext/scx_userland.c | 54 ++++++------------------------- 8 files changed, 82 insertions(+), 95 deletions(-) create mode 100644 tools/sched_ext/scx_user_common.h diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 1515ff9cce7f9..278c475310667 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -168,27 +168,28 @@ endif $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $@ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $(@:.skel.h=.subskel.h) -scx_simple: scx_simple.c scx_simple.skel.h user_exit_info.h +scx_simple: scx_simple.c scx_simple.skel.h user_exit_info.h scx_user_common.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_qmap: scx_qmap.c scx_qmap.skel.h user_exit_info.h +scx_qmap: scx_qmap.c scx_qmap.skel.h user_exit_info.h scx_user_common.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_central: scx_central.c scx_central.skel.h user_exit_info.h +scx_central: scx_central.c scx_central.skel.h user_exit_info.h scx_user_common.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_pair: scx_pair.c scx_pair.skel.h user_exit_info.h +scx_pair: scx_pair.c scx_pair.skel.h user_exit_info.h scx_user_common.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_flatcg: scx_flatcg.c scx_flatcg.skel.h user_exit_info.h +scx_flatcg: scx_flatcg.c scx_flatcg.skel.h user_exit_info.h scx_user_common.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h user_exit_info.h +scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h user_exit_info.h \ + scx_user_common.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 05eed8c8ec442..a303401ffe1a1 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -7,11 +7,11 @@ #include #include #include -#include #include #include #include "user_exit_info.h" #include "scx_central.skel.h" +#include "scx_user_common.h" const char help_fmt[] = "A central FIFO sched_ext scheduler.\n" @@ -44,7 +44,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = scx_central__open(); - assert(skel); + SCX_BUG_ON(!skel, "Failed to open skel"); skel->rodata->central_cpu = 0; skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); @@ -63,10 +63,10 @@ int main(int argc, char **argv) } } - assert(!scx_central__load(skel)); + SCX_BUG_ON(scx_central__load(skel), "Failed to load skel"); link = bpf_map__attach_struct_ops(skel->maps.central_ops); - assert(link); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 072a04979bdd0..fbe93083fe641 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -11,11 +11,11 @@ #include #include #include -#include #include #include "user_exit_info.h" #include "scx_flatcg.h" #include 
"scx_flatcg.skel.h" +#include "scx_user_common.h" #ifndef FILEID_KERNFS #define FILEID_KERNFS 0xfe @@ -127,10 +127,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = scx_flatcg__open(); - if (!skel) { - fprintf(stderr, "Failed to open: %s\n", strerror(errno)); - return 1; - } + SCX_BUG_ON(!skel, "Failed to open skel"); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); @@ -168,17 +165,10 @@ int main(int argc, char **argv) (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, dump_cgrps); - if (scx_flatcg__load(skel)) { - fprintf(stderr, "Failed to load: %s\n", strerror(errno)); - return 1; - } + SCX_BUG_ON(scx_flatcg__load(skel), "Failed to load skel"); link = bpf_map__attach_struct_ops(skel->maps.flatcg_ops); - if (!link) { - fprintf(stderr, "Failed to attach_struct_ops: %s\n", - strerror(errno)); - return 1; - } + SCX_BUG_ON(!link, "Failed to attach struct_ops"); while (!exit_req && !uei_exited(&skel->bss->uei)) { __u64 acc_stats[FCG_NR_STATS]; diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index c98db94903045..c2de48430c5b3 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -7,12 +7,12 @@ #include #include #include -#include #include #include #include "user_exit_info.h" #include "scx_pair.h" #include "scx_pair.skel.h" +#include "scx_user_common.h" const char help_fmt[] = "A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" @@ -46,7 +46,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = scx_pair__open(); - assert(skel); + SCX_BUG_ON(!skel, "Failed to open skel"); skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); @@ -74,19 +74,13 @@ int main(int argc, char **argv) if (skel->rodata->pair_cpu[i] >= 0) continue; - if (i == j) { - printf("\n"); - fprintf(stderr, "Invalid stride %d - CPU%d wants to be its own pair\n", - stride, i); - return 1; - } + SCX_BUG_ON(i == j, + "Invalid stride %d - CPU%d wants to be its own pair", + stride, i); - if (skel->rodata->pair_cpu[j] >= 0) { - printf("\n"); - fprintf(stderr, "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair\n", - stride, i, j, skel->rodata->pair_cpu[j]); - return 1; - } + SCX_BUG_ON(skel->rodata->pair_cpu[j] >= 0, + "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", + stride, i, j, skel->rodata->pair_cpu[j]); skel->rodata->pair_cpu[i] = j; skel->rodata->pair_cpu[j] = i; @@ -99,7 +93,7 @@ int main(int argc, char **argv) } printf("\n"); - assert(!scx_pair__load(skel)); + SCX_BUG_ON(scx_pair__load(skel), "Failed to load skel"); /* * Populate the cgrp_q_arr map which is an array containing per-cgroup @@ -108,7 +102,7 @@ int main(int argc, char **argv) * populate from BPF. */ outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); - assert(outer_fd >= 0); + SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); printf("Initializing"); for (i = 0; i < MAX_CGRPS; i++) { @@ -119,8 +113,10 @@ int main(int argc, char **argv) inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, sizeof(__u32), MAX_QUEUED, NULL); - assert(inner_fd >= 0); - assert(!bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY)); + SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", + inner_fd); + SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), + "Failed to set inner map"); close(inner_fd); if (!(i % 10)) @@ -133,7 +129,7 @@ int main(int argc, char **argv) * Fully initialized, attach and run. 
*/ link = bpf_map__attach_struct_ops(skel->maps.pair_ops); - assert(link); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 3d122d34454e8..d275adecdc44f 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -8,11 +8,11 @@ #include #include #include -#include #include #include #include "user_exit_info.h" #include "scx_qmap.skel.h" +#include "scx_user_common.h" const char help_fmt[] = "A simple five-level FIFO queue sched_ext scheduler.\n" @@ -49,7 +49,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = scx_qmap__open(); - assert(skel); + SCX_BUG_ON(!skel, "Failed to open skel"); while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) { switch (opt) { @@ -82,10 +82,10 @@ int main(int argc, char **argv) } } - assert(!scx_qmap__load(skel)); + SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel"); link = bpf_map__attach_struct_ops(skel->maps.qmap_ops); - assert(link); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); while (!exit_req && !uei_exited(&skel->bss->uei)) { long nr_enqueued = skel->bss->nr_enqueued; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 7e32d4fd4aa07..5cca991f57889 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -7,11 +7,11 @@ #include #include #include -#include #include #include #include "user_exit_info.h" #include "scx_simple.skel.h" +#include "scx_user_common.h" const char help_fmt[] = "A simple sched_ext scheduler.\n" @@ -63,7 +63,7 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = scx_simple__open(); - assert(skel); + SCX_BUG_ON(!skel, "Failed to open skel"); while ((opt = getopt(argc, argv, "fph")) != -1) { switch (opt) { @@ -79,10 +79,10 @@ int main(int argc, char **argv) } } - assert(!scx_simple__load(skel)); + SCX_BUG_ON(scx_simple__load(skel), "Failed to load skel"); link = bpf_map__attach_struct_ops(skel->maps.simple_ops); - assert(link); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); while (!exit_req && !uei_exited(&skel->bss->uei)) { __u64 stats[2]; diff --git a/tools/sched_ext/scx_user_common.h b/tools/sched_ext/scx_user_common.h new file mode 100644 index 0000000000000..76a0d12eba28c --- /dev/null +++ b/tools/sched_ext/scx_user_common.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#ifndef __SCHED_EXT_USER_COMMON_H +#define __SCHED_EXT_USER_COMMON_H + +#include +#include +#include + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ + strerror(errno)); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...) 
\ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +#endif /* __SCHED_EXT_USER_COMMON_H */ diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index 2a50a9b459a5b..d393237818861 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -30,6 +30,7 @@ #include "user_exit_info.h" #include "scx_userland.h" #include "scx_userland.skel.h" +#include "scx_user_common.h" const char help_fmt[] = "A minimal userland sched_ext scheduler.\n" @@ -263,7 +264,7 @@ static int spawn_stats_thread(void) return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); } -static int bootstrap(int argc, char **argv) +static void bootstrap(int argc, char **argv) { int err; __u32 opt; @@ -284,10 +285,7 @@ static int bootstrap(int argc, char **argv) * needs to be scheduled. */ err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); - if (err) { - fprintf(stderr, "Failed to set scheduler to SCHED_EXT: %s\n", strerror(err)); - return err; - } + SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); while ((opt = getopt(argc, argv, "b:ph")) != -1) { switch (opt) { @@ -309,53 +307,28 @@ static int bootstrap(int argc, char **argv) * to allocate. */ err = mlockall(MCL_CURRENT | MCL_FUTURE); - if (err) { - fprintf(stderr, "Failed to prefault and lock address space: %s\n", - strerror(err)); - return err; - } + SCX_BUG_ON(err, "Failed to prefault and lock address space"); skel = scx_userland__open(); - if (!skel) { - fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno)); - return errno; - } + SCX_BUG_ON(!skel, "Failed to open skel"); + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); assert(skel->rodata->num_possible_cpus > 0); skel->rodata->usersched_pid = getpid(); assert(skel->rodata->usersched_pid > 0); skel->rodata->switch_partial = switch_partial; - err = scx_userland__load(skel); - if (err) { - fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err)); - goto destroy_skel; - } + SCX_BUG_ON(scx_userland__load(skel), "Failed to load skel"); enqueued_fd = bpf_map__fd(skel->maps.enqueued); dispatched_fd = bpf_map__fd(skel->maps.dispatched); assert(enqueued_fd > 0); assert(dispatched_fd > 0); - err = spawn_stats_thread(); - if (err) { - fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err)); - goto destroy_skel; - } + SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops); - if (!ops_link) { - fprintf(stderr, "Failed to attach struct ops: %s\n", strerror(errno)); - err = errno; - goto destroy_skel; - } - - return 0; - -destroy_skel: - scx_userland__destroy(skel); - exit_req = 1; - return err; + SCX_BUG_ON(!ops_link, "Failed to attach struct_ops"); } static void sched_main_loop(void) @@ -383,14 +356,7 @@ static void sched_main_loop(void) int main(int argc, char **argv) { - int err; - - err = bootstrap(argc, argv); - if (err) { - fprintf(stderr, "Failed to bootstrap scheduler: %s\n", strerror(err)); - return err; - } - + bootstrap(argc, argv); sched_main_loop(); exit_req = 1; From d4ae6a36d62c5691c08aa4c005e887914ef02faf Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 19 Jul 2023 19:21:56 -0500 Subject: [PATCH 088/304] scx: Fix some clippy warnings in Atropos If you run clippy on atropos, it notices a few places where can clean up the rust code. Let's do that. 
Note that we're skipping this suggestion, as it harms readability: cargo clippy --manifest-path=scx_atropos/Cargo.toml --release Checking scx_atropos v0.5.0 (/home/void/upstream/sched_ext/tools/sched_ext/scx_atropos) warning: manual implementation of `Option::map` --> src/main.rs:662:9 | 662 | / match tasks_by_load 663 | | .into_iter() 664 | | .skip_while(|(_, task)| { 665 | | task.migrated.get() ... | 672 | | None => None, 673 | | } | |_________^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_map = note: `#[warn(clippy::manual_map)]` on by default help: try this | 662 ~ tasks_by_load 663 + .into_iter() 664 + .skip_while(|(_, task)| { 665 + task.migrated.get() 666 + || (task.dom_mask & (1 << pull_dom) == 0) 667 + || (skip_kworkers && task.is_kworker) 668 + }) 669 + .next().map(|(OrderedFloat(load), task)| (*load, task)) | warning: called `skip_while(
<p>
).next()` on an `Iterator` --> src/main.rs:662:15 | 662 | match tasks_by_load | _______________^ 663 | | .into_iter() 664 | | .skip_while(|(_, task)| { 665 | | task.migrated.get() ... | 668 | | }) 669 | | .next() | |___________________^ | = help: this is more succinctly expressed by calling `.find(!
<p>
)` instead = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#skip_while_next = note: `#[warn(clippy::skip_while_next)]` on by default Signed-off-by: David Vernet --- tools/sched_ext/scx_atropos/build.rs | 2 +- tools/sched_ext/scx_atropos/src/main.rs | 34 ++++++++++++------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/sched_ext/scx_atropos/build.rs b/tools/sched_ext/scx_atropos/build.rs index 26e792c5e17e9..bb56928ff0b59 100644 --- a/tools/sched_ext/scx_atropos/build.rs +++ b/tools/sched_ext/scx_atropos/build.rs @@ -50,7 +50,7 @@ fn gen_bpf_sched(name: &str) { .source(src.clone()) .clang(clang) .clang_args(bpf_cflags) - .build_and_generate(&skel) + .build_and_generate(skel) .unwrap(); println!("cargo:rerun-if-changed={}", src); } diff --git a/tools/sched_ext/scx_atropos/src/main.rs b/tools/sched_ext/scx_atropos/src/main.rs index aebbcd7b0bcee..a411b6eeb9f1f 100644 --- a/tools/sched_ext/scx_atropos/src/main.rs +++ b/tools/sched_ext/scx_atropos/src/main.rs @@ -159,11 +159,11 @@ fn format_cpumask(cpumask: &[u64], nr_cpus: usize) -> String { } fn read_total_cpu(reader: &procfs::ProcReader) -> Result { - Ok(reader + reader .read_stat() .context("Failed to read procfs")? .total_cpu - .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))?) + .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) } fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { @@ -205,9 +205,9 @@ fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; let total_usec = idle_usec + busy_usec + iowait_usec; if total_usec > 0 { - return Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)); + Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) } else { - return Ok(1.0); + Ok(1.0) } } _ => { @@ -319,8 +319,8 @@ impl Topology { }; cpu_to_cache.push(id); - if id.is_some() { - cache_ids.insert(id.unwrap()); + if let Some(id) = id { + cache_ids.insert(id); } } @@ -349,13 +349,13 @@ impl Topology { // Build and return dom -> cpumask and cpu -> dom mappings. let mut dom_cpus = - vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; nr_doms as usize]; + vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; nr_doms]; let mut cpu_dom = vec![]; - for cpu in 0..nr_cpus { - match cpu_to_cache[cpu] { + for (cpu, cache) in cpu_to_cache.iter().enumerate().take(nr_cpus) { + match cache { Some(cache_id) => { - let dom_id = cache_to_dom[&cache_id]; + let dom_id = cache_to_dom[cache_id]; dom_cpus[dom_id].set(cpu, true); cpu_dom.push(Some(dom_id)); } @@ -868,7 +868,7 @@ impl<'a> Scheduler<'a> { } // Initialize skel according to @opts. - let top = Arc::new(if opts.cpumasks.len() > 0 { + let top = Arc::new(if !opts.cpumasks.is_empty() { Topology::from_cpumasks(&opts.cpumasks, nr_cpus)? } else { Topology::from_cache_level(opts.cache_level, nr_cpus)? 
@@ -939,7 +939,7 @@ impl<'a> Scheduler<'a> { } fn get_cpu_busy(&mut self) -> Result { - let total_cpu = read_total_cpu(&mut self.proc_reader)?; + let total_cpu = read_total_cpu(&self.proc_reader)?; let busy = match (&self.prev_total_cpu, &total_cpu) { ( procfs::CpuStat { @@ -998,7 +998,7 @@ impl<'a> Scheduler<'a> { for stat in 0..atropos_sys::stat_idx_ATROPOS_NR_STATS { let cpu_stat_vec = stats_map - .lookup_percpu(&(stat as u32).to_ne_bytes(), libbpf_rs::MapFlags::ANY) + .lookup_percpu(&stat.to_ne_bytes(), libbpf_rs::MapFlags::ANY) .with_context(|| format!("Failed to lookup stat {}", stat))? .expect("per-cpu stat should exist"); let sum = cpu_stat_vec @@ -1013,7 +1013,7 @@ impl<'a> Scheduler<'a> { .sum(); stats_map .update_percpu( - &(stat as u32).to_ne_bytes(), + &stat.to_ne_bytes(), &zero_vec, libbpf_rs::MapFlags::ANY, ) @@ -1025,12 +1025,12 @@ impl<'a> Scheduler<'a> { fn report( &mut self, - stats: &Vec, + stats: &[u64], cpu_busy: f64, processing_dur: Duration, load_avg: f64, - dom_loads: &Vec, - imbal: &Vec, + dom_loads: &[f64], + imbal: &[f64], ) { let stat = |idx| stats[idx as usize]; let total = stat(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC) From 29ccb3a6e35fd62655a2ae62e97e9587963696ed Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 20 Jul 2023 13:27:04 -0500 Subject: [PATCH 089/304] scx: Rename scx_atropos to scx_rusty scx_rusty is a more descriptive name of the scheduler, so let's rename it to that. Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 16 +-- .../{scx_atropos => scx_rusty}/.gitignore | 0 .../{scx_atropos => scx_rusty}/Cargo.toml | 2 +- .../{scx_atropos => scx_rusty}/build.rs | 14 +-- .../{scx_atropos => scx_rusty}/rustfmt.toml | 0 .../src/bpf/rusty.bpf.c} | 110 +++++++++--------- .../atropos.h => scx_rusty/src/bpf/rusty.h} | 34 +++--- .../{scx_atropos => scx_rusty}/src/main.rs | 96 +++++++-------- .../src/rusty_sys.rs} | 2 +- 9 files changed, 137 insertions(+), 137 deletions(-) rename tools/sched_ext/{scx_atropos => scx_rusty}/.gitignore (100%) rename tools/sched_ext/{scx_atropos => scx_rusty}/Cargo.toml (97%) rename tools/sched_ext/{scx_atropos => scx_rusty}/build.rs (88%) rename tools/sched_ext/{scx_atropos => scx_rusty}/rustfmt.toml (100%) rename tools/sched_ext/{scx_atropos/src/bpf/atropos.bpf.c => scx_rusty/src/bpf/rusty.bpf.c} (89%) rename tools/sched_ext/{scx_atropos/src/bpf/atropos.h => scx_rusty/src/bpf/rusty.h} (70%) rename tools/sched_ext/{scx_atropos => scx_rusty}/src/main.rs (93%) rename tools/sched_ext/{scx_atropos/src/atropos_sys.rs => scx_rusty/src/rusty_sys.rs} (83%) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 278c475310667..8ad8e186aefa9 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -117,7 +117,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wall -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland scx_atropos +all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland scx_rusty # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -193,19 +193,19 @@ scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) -scx_atropos: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) -scx_atropos: export ATROPOS_CLANG = $(CLANG) -scx_atropos: export ATROPOS_BPF_CFLAGS = $(BPF_CFLAGS) -scx_atropos: 
$(INCLUDE_DIR)/vmlinux.h - cargo build --manifest-path=scx_atropos/Cargo.toml $(CARGOFLAGS) +scx_rusty: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) +scx_rusty: export SCX_RUSTY_CLANG = $(CLANG) +scx_rusty: export SCX_RUSTY_BPF_CFLAGS = $(BPF_CFLAGS) +scx_rusty: $(INCLUDE_DIR)/vmlinux.h + cargo build --manifest-path=scx_rusty/Cargo.toml $(CARGOFLAGS) clean: - cargo clean --manifest-path=scx_atropos/Cargo.toml + cargo clean --manifest-path=scx_rusty/Cargo.toml rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland -.PHONY: all scx_atropos clean +.PHONY: all scx_rusty clean # delete failed targets .DELETE_ON_ERROR: diff --git a/tools/sched_ext/scx_atropos/.gitignore b/tools/sched_ext/scx_rusty/.gitignore similarity index 100% rename from tools/sched_ext/scx_atropos/.gitignore rename to tools/sched_ext/scx_rusty/.gitignore diff --git a/tools/sched_ext/scx_atropos/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml similarity index 97% rename from tools/sched_ext/scx_atropos/Cargo.toml rename to tools/sched_ext/scx_rusty/Cargo.toml index 40a782282b371..b0edd3b937d41 100644 --- a/tools/sched_ext/scx_atropos/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "scx_atropos" +name = "scx_rusty" version = "0.5.0" authors = ["Dan Schatzberg ", "Meta"] edition = "2021" diff --git a/tools/sched_ext/scx_atropos/build.rs b/tools/sched_ext/scx_rusty/build.rs similarity index 88% rename from tools/sched_ext/scx_atropos/build.rs rename to tools/sched_ext/scx_rusty/build.rs index bb56928ff0b59..d47a754514ada 100644 --- a/tools/sched_ext/scx_atropos/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -11,9 +11,9 @@ use std::path::PathBuf; use libbpf_cargo::SkeletonBuilder; -const HEADER_PATH: &str = "src/bpf/atropos.h"; +const HEADER_PATH: &str = "src/bpf/rusty.h"; -fn bindgen_atropos() { +fn bindgen_rusty() { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed={}", HEADER_PATH); @@ -35,13 +35,13 @@ fn bindgen_atropos() { // Write the bindings to the $OUT_DIR/bindings.rs file. let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); bindings - .write_to_file(out_path.join("atropos-sys.rs")) + .write_to_file(out_path.join("rusty_sys.rs")) .expect("Couldn't write bindings!"); } fn gen_bpf_sched(name: &str) { - let bpf_cflags = env::var("ATROPOS_BPF_CFLAGS").unwrap(); - let clang = env::var("ATROPOS_CLANG").unwrap(); + let bpf_cflags = env::var("SCX_RUSTY_BPF_CFLAGS").unwrap(); + let clang = env::var("SCX_RUSTY_CLANG").unwrap(); eprintln!("{}", clang); let outpath = format!("./src/bpf/.output/{}.skel.rs", name); let skel = Path::new(&outpath); @@ -56,7 +56,7 @@ fn gen_bpf_sched(name: &str) { } fn main() { - bindgen_atropos(); + bindgen_rusty(); // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. // Reasons are because the generated skeleton contains compiler attributes // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` @@ -66,5 +66,5 @@ fn main() { // However, there is hope! When the above feature stabilizes we can clean this // all up. 
create_dir_all("./src/bpf/.output").unwrap(); - gen_bpf_sched("atropos"); + gen_bpf_sched("rusty"); } diff --git a/tools/sched_ext/scx_atropos/rustfmt.toml b/tools/sched_ext/scx_rusty/rustfmt.toml similarity index 100% rename from tools/sched_ext/scx_atropos/rustfmt.toml rename to tools/sched_ext/scx_rusty/rustfmt.toml diff --git a/tools/sched_ext/scx_atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c similarity index 89% rename from tools/sched_ext/scx_atropos/src/bpf/atropos.bpf.c rename to tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 118fe728e886d..2c68b659cacd1 100644 --- a/tools/sched_ext/scx_atropos/src/bpf/atropos.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -3,16 +3,16 @@ * This software may be used and distributed according to the terms of the * GNU General Public License version 2. * - * Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF + * scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF * part does simple round robin in each domain and the userspace part * calculates the load factor of each domain and tells the BPF part how to load * balance the domains. * * Every task has an entry in the task_data map which lists which domain the - * task belongs to. When a task first enters the system (atropos_prep_enable), + * task belongs to. When a task first enters the system (rusty_prep_enable), * they are round-robined to a domain. * - * atropos_select_cpu is the primary scheduling logic, invoked when a task + * rusty_select_cpu is the primary scheduling logic, invoked when a task * becomes runnable. The lb_data map is populated by userspace to inform the BPF * scheduler that a task should be migrated to a new domain. Otherwise, the task * is scheduled in priority order as follows: @@ -23,11 +23,11 @@ * * Any idle cpu in the domain * * If none of the above conditions are met, then the task is enqueued to a - * dispatch queue corresponding to the domain (atropos_enqueue). + * dispatch queue corresponding to the domain (rusty_enqueue). * - * atropos_dispatch will attempt to consume a task from its domain's + * rusty_dispatch will attempt to consume a task from its domain's * corresponding dispatch queue (this occurs after scheduling any tasks directly - * assigned to it due to the logic in atropos_select_cpu). If no task is found, + * assigned to it due to the logic in rusty_select_cpu). If no task is found, * then greedy load stealing will attempt to find a task on another dispatch * queue to run. * @@ -36,7 +36,7 @@ * load balance based on userspace populating the lb_data map. 
*/ #include "../../../scx_common.bpf.h" -#include "atropos.h" +#include "rusty.h" #include #include @@ -110,7 +110,7 @@ struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(key_size, sizeof(u32)); __uint(value_size, sizeof(u64)); - __uint(max_entries, ATROPOS_NR_STATS); + __uint(max_entries, RUSTY_NR_STATS); } stats SEC(".maps"); static inline void stat_add(enum stat_idx idx, u64 addend) @@ -270,7 +270,7 @@ static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, return task_ctx->dom_id == new_dom_id; } -s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, +s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); @@ -289,7 +289,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (kthreads_local && (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { cpu = prev_cpu; - stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); goto direct; } @@ -330,7 +330,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (has_idle) { cpu = bpf_get_smp_processor_id(); if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { - stat_add(ATROPOS_STAT_WAKE_SYNC, 1); + stat_add(RUSTY_STAT_WAKE_SYNC, 1); goto direct; } } @@ -339,7 +339,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, /* If only one CPU is allowed, dispatch */ if (p->nr_cpus_allowed == 1) { - stat_add(ATROPOS_STAT_PINNED, 1); + stat_add(RUSTY_STAT_PINNED, 1); cpu = prev_cpu; goto direct; } @@ -358,7 +358,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, if (prev_domestic) { if (bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { - stat_add(ATROPOS_STAT_PREV_IDLE, 1); + stat_add(RUSTY_STAT_PREV_IDLE, 1); cpu = prev_cpu; goto direct; } @@ -373,7 +373,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, direct_greedy_cpumask) && bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { - stat_add(ATROPOS_STAT_GREEDY_IDLE, 1); + stat_add(RUSTY_STAT_GREEDY_IDLE, 1); cpu = prev_cpu; goto direct; } @@ -390,7 +390,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { - stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); goto direct; } } @@ -400,7 +400,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, * isn't, picking @prev_cpu may improve L1/2 locality. 
*/ if (prev_domestic && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { - stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); cpu = prev_cpu; goto direct; } @@ -408,7 +408,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, /* If there is any domestic idle CPU, dispatch directly */ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) { - stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); goto direct; } @@ -434,7 +434,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, domc->direct_greedy_cpumask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { - stat_add(ATROPOS_STAT_DIRECT_GREEDY, 1); + stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); goto direct; } } @@ -444,7 +444,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, direct_greedy_cpumask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { - stat_add(ATROPOS_STAT_DIRECT_GREEDY_FAR, 1); + stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); goto direct; } } @@ -457,7 +457,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) domc->direct_greedy_cpumask, 0); if (cpu >= 0) { - stat_add(ATROPOS_STAT_DIRECT_GREEDY, 1); + stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); goto direct; } } @@ -466,7 +466,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) direct_greedy_cpumask, 0); if (cpu >= 0) { - stat_add(ATROPOS_STAT_DIRECT_GREEDY_FAR, 1); + stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); goto direct; } } @@ -496,7 +496,7 @@ s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, s32 prev_cpu, return -ENOENT; } -void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) +void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) { struct task_ctx *task_ctx; struct bpf_cpumask *p_cpumask; @@ -516,7 +516,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) new_dom = bpf_map_lookup_elem(&lb_data, &pid); if (new_dom && *new_dom != task_ctx->dom_id && task_set_domain(task_ctx, p, *new_dom, false)) { - stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); + stat_add(RUSTY_STAT_LOAD_BALANCE, 1); task_ctx->dispatch_local = false; cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) @@ -541,7 +541,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), (const struct cpumask *)p_cpumask)) { cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); scx_bpf_kick_cpu(cpu, 0); - stat_add(ATROPOS_STAT_REPATRIATE, 1); + stat_add(RUSTY_STAT_REPATRIATE, 1); } dom_queue: @@ -589,7 +589,7 @@ void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u64 enq_flags) cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) kick_greedy_cpumask, 0); if (cpu >= 0) { - stat_add(ATROPOS_STAT_KICK_GREEDY, 1); + stat_add(RUSTY_STAT_KICK_GREEDY, 1); scx_bpf_kick_cpu(cpu, 0); } } @@ -628,12 +628,12 @@ static u32 dom_rr_next(s32 cpu) return dom_id; } -void BPF_STRUCT_OPS(atropos_dispatch, s32 cpu, struct task_struct *prev) +void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev) { u32 dom = cpu_to_dom_id(cpu); if (scx_bpf_consume(dom)) { - stat_add(ATROPOS_STAT_DSQ_DISPATCH, 1); + stat_add(RUSTY_STAT_DSQ_DISPATCH, 1); return; } @@ -645,13 +645,13 @@ void BPF_STRUCT_OPS(atropos_dispatch, s32 cpu, struct task_struct *prev) if 
(scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold && scx_bpf_consume(dom_id)) { - stat_add(ATROPOS_STAT_GREEDY, 1); + stat_add(RUSTY_STAT_GREEDY, 1); break; } } } -void BPF_STRUCT_OPS(atropos_runnable, struct task_struct *p, u64 enq_flags) +void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) { struct task_ctx *task_ctx; pid_t pid = p->pid; @@ -665,7 +665,7 @@ void BPF_STRUCT_OPS(atropos_runnable, struct task_struct *p, u64 enq_flags) task_ctx->is_kworker = p->flags & PF_WQ_WORKER; } -void BPF_STRUCT_OPS(atropos_running, struct task_struct *p) +void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) { struct task_ctx *taskc; struct dom_ctx *domc; @@ -698,7 +698,7 @@ void BPF_STRUCT_OPS(atropos_running, struct task_struct *p) domc->vtime_now = p->scx.dsq_vtime; } -void BPF_STRUCT_OPS(atropos_stopping, struct task_struct *p, bool runnable) +void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) { if (fifo_sched) return; @@ -707,7 +707,7 @@ void BPF_STRUCT_OPS(atropos_stopping, struct task_struct *p, bool runnable) p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; } -void BPF_STRUCT_OPS(atropos_quiescent, struct task_struct *p, u64 deq_flags) +void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) { struct task_ctx *task_ctx; pid_t pid = p->pid; @@ -721,7 +721,7 @@ void BPF_STRUCT_OPS(atropos_quiescent, struct task_struct *p, u64 deq_flags) task_ctx->runnable_at = 0; } -void BPF_STRUCT_OPS(atropos_set_weight, struct task_struct *p, u32 weight) +void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight) { struct task_ctx *task_ctx; pid_t pid = p->pid; @@ -777,7 +777,7 @@ static void task_pick_and_set_domain(struct task_ctx *task_ctx, dom_id, p->comm, p->pid); } -void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, +void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, const struct cpumask *cpumask) { struct task_ctx *task_ctx; @@ -793,7 +793,7 @@ void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, task_ctx->all_cpus = bpf_cpumask_full(cpumask); } -s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, +s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, struct scx_enable_args *args) { struct bpf_cpumask *cpumask; @@ -806,7 +806,7 @@ s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, pid = p->pid; ret = bpf_map_update_elem(&task_data, &pid, &task_ctx, BPF_NOEXIST); if (ret) { - stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + stat_add(RUSTY_STAT_TASK_GET_ERR, 1); return ret; } @@ -838,12 +838,12 @@ s32 BPF_STRUCT_OPS(atropos_prep_enable, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(atropos_disable, struct task_struct *p) +void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p) { pid_t pid = p->pid; long ret = bpf_map_delete_elem(&task_data, &pid); if (ret) { - stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + stat_add(RUSTY_STAT_TASK_GET_ERR, 1); return; } } @@ -919,7 +919,7 @@ static s32 create_dom(u32 dom_id) return 0; } -s32 BPF_STRUCT_OPS_SLEEPABLE(atropos_init) +s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) { struct bpf_cpumask *cpumask; s32 i, ret; @@ -953,26 +953,26 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(atropos_init) return 0; } -void BPF_STRUCT_OPS(atropos_exit, struct scx_exit_info *ei) +void BPF_STRUCT_OPS(rusty_exit, struct scx_exit_info *ei) { bpf_probe_read_kernel_str(exit_msg, sizeof(exit_msg), ei->msg); exit_type = ei->type; } SEC(".struct_ops.link") -struct sched_ext_ops atropos = { - .select_cpu = (void *)atropos_select_cpu, - .enqueue = 
(void *)atropos_enqueue, - .dispatch = (void *)atropos_dispatch, - .runnable = (void *)atropos_runnable, - .running = (void *)atropos_running, - .stopping = (void *)atropos_stopping, - .quiescent = (void *)atropos_quiescent, - .set_weight = (void *)atropos_set_weight, - .set_cpumask = (void *)atropos_set_cpumask, - .prep_enable = (void *)atropos_prep_enable, - .disable = (void *)atropos_disable, - .init = (void *)atropos_init, - .exit = (void *)atropos_exit, - .name = "atropos", +struct sched_ext_ops rusty = { + .select_cpu = (void *)rusty_select_cpu, + .enqueue = (void *)rusty_enqueue, + .dispatch = (void *)rusty_dispatch, + .runnable = (void *)rusty_runnable, + .running = (void *)rusty_running, + .stopping = (void *)rusty_stopping, + .quiescent = (void *)rusty_quiescent, + .set_weight = (void *)rusty_set_weight, + .set_cpumask = (void *)rusty_set_cpumask, + .prep_enable = (void *)rusty_prep_enable, + .disable = (void *)rusty_disable, + .init = (void *)rusty_init, + .exit = (void *)rusty_exit, + .name = "rusty", }; diff --git a/tools/sched_ext/scx_atropos/src/bpf/atropos.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h similarity index 70% rename from tools/sched_ext/scx_atropos/src/bpf/atropos.h rename to tools/sched_ext/scx_rusty/src/bpf/rusty.h index 894782e32fa1e..28eed277fd8af 100644 --- a/tools/sched_ext/scx_atropos/src/bpf/atropos.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -2,8 +2,8 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -#ifndef __ATROPOS_H -#define __ATROPOS_H +#ifndef __RUSTY_H +#define __RUSTY_H #include #ifndef __kptr @@ -20,25 +20,25 @@ /* Statistics */ enum stat_idx { /* The following fields add up to all dispatched tasks */ - ATROPOS_STAT_WAKE_SYNC, - ATROPOS_STAT_PREV_IDLE, - ATROPOS_STAT_GREEDY_IDLE, - ATROPOS_STAT_PINNED, - ATROPOS_STAT_DIRECT_DISPATCH, - ATROPOS_STAT_DIRECT_GREEDY, - ATROPOS_STAT_DIRECT_GREEDY_FAR, - ATROPOS_STAT_DSQ_DISPATCH, - ATROPOS_STAT_GREEDY, + RUSTY_STAT_WAKE_SYNC, + RUSTY_STAT_PREV_IDLE, + RUSTY_STAT_GREEDY_IDLE, + RUSTY_STAT_PINNED, + RUSTY_STAT_DIRECT_DISPATCH, + RUSTY_STAT_DIRECT_GREEDY, + RUSTY_STAT_DIRECT_GREEDY_FAR, + RUSTY_STAT_DSQ_DISPATCH, + RUSTY_STAT_GREEDY, /* Extra stats that don't contribute to total */ - ATROPOS_STAT_REPATRIATE, - ATROPOS_STAT_KICK_GREEDY, - ATROPOS_STAT_LOAD_BALANCE, + RUSTY_STAT_REPATRIATE, + RUSTY_STAT_KICK_GREEDY, + RUSTY_STAT_LOAD_BALANCE, /* Errors */ - ATROPOS_STAT_TASK_GET_ERR, + RUSTY_STAT_TASK_GET_ERR, - ATROPOS_NR_STATS, + RUSTY_NR_STATS, }; struct task_ctx { @@ -61,4 +61,4 @@ struct task_ctx { bool dispatch_local; }; -#endif /* __ATROPOS_H */ +#endif /* __RUSTY_H */ diff --git a/tools/sched_ext/scx_atropos/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs similarity index 93% rename from tools/sched_ext/scx_atropos/src/main.rs rename to tools/sched_ext/scx_rusty/src/main.rs index a411b6eeb9f1f..4ff57913aa1d7 100644 --- a/tools/sched_ext/scx_atropos/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -2,10 +2,10 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. 
-#[path = "bpf/.output/atropos.skel.rs"] -mod atropos; -pub use atropos::*; -pub mod atropos_sys; +#[path = "bpf/.output/rusty.skel.rs"] +mod rusty; +pub use rusty::*; +pub mod rusty_sys; use std::cell::Cell; use std::collections::BTreeMap; @@ -34,7 +34,7 @@ use log::trace; use log::warn; use ordered_float::OrderedFloat; -/// Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF +/// scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF /// part does simple round robin in each domain and the userspace part /// calculates the load factor of each domain and tells the BPF part how to load /// balance the domains. @@ -45,9 +45,9 @@ use ordered_float::OrderedFloat; /// chiplet in a six-chiplet AMD processor, and could match the performance of /// production setup using CFS. /// -/// WARNING: Atropos currently assumes that all domains have equal -/// processing power and at similar distances from each other. This -/// limitation will be removed in the future. +/// WARNING: scx_rusty currently assumes that all domains have equal processing +/// power and at similar distances from each other. This limitation will be +/// removed in the future. #[derive(Debug, Parser)] struct Opts { /// Scheduling slice duration in microseconds. @@ -226,16 +226,16 @@ struct Topology { impl Topology { fn from_cpumasks(cpumasks: &[String], nr_cpus: usize) -> Result { - if cpumasks.len() > atropos_sys::MAX_DOMS as usize { + if cpumasks.len() > rusty_sys::MAX_DOMS as usize { bail!( "Number of requested domains ({}) is greater than MAX_DOMS ({})", cpumasks.len(), - atropos_sys::MAX_DOMS + rusty_sys::MAX_DOMS ); } let mut cpu_dom = vec![None; nr_cpus]; let mut dom_cpus = - vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; cpumasks.len()]; + vec![bitvec![u64, Lsb0; 0; rusty_sys::MAX_CPUS as usize]; cpumasks.len()]; for (dom, cpumask) in cpumasks.iter().enumerate() { let hex_str = { let mut tmp_str = cpumask @@ -339,17 +339,17 @@ impl Topology { nr_doms += 1; } - if nr_doms > atropos_sys::MAX_DOMS as usize { + if nr_doms > rusty_sys::MAX_DOMS as usize { bail!( "Total number of doms {} is greater than MAX_DOMS ({})", nr_doms, - atropos_sys::MAX_DOMS + rusty_sys::MAX_DOMS ); } // Build and return dom -> cpumask and cpu -> dom mappings. let mut dom_cpus = - vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; nr_doms]; + vec![bitvec![u64, Lsb0; 0; rusty_sys::MAX_CPUS as usize]; nr_doms]; let mut cpu_dom = vec![]; for (cpu, cache) in cpu_to_cache.iter().enumerate().take(nr_cpus) { @@ -401,7 +401,7 @@ impl Tuner { }) } - fn step(&mut self, skel: &mut AtroposSkel) -> Result<()> { + fn step(&mut self, skel: &mut RustySkel) -> Result<()> { let curr_cpu_stats = self .proc_reader .read_stat()? @@ -483,7 +483,7 @@ struct TaskInfo { } struct LoadBalancer<'a, 'b, 'c> { - maps: AtroposMapsMut<'a>, + maps: RustyMapsMut<'a>, top: Arc, task_loads: &'b mut BTreeMap, load_decay_factor: f64, @@ -520,7 +520,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; fn new( - maps: AtroposMapsMut<'a>, + maps: RustyMapsMut<'a>, top: Arc, task_loads: &'b mut BTreeMap, load_decay_factor: f64, @@ -560,7 +560,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { .context("Failed to lookup task_data")? 
{ let task_ctx = - unsafe { &*(task_ctx_vec.as_slice().as_ptr() as *const atropos_sys::task_ctx) }; + unsafe { &*(task_ctx_vec.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; let pid = i32::from_ne_bytes( key.as_slice() .try_into() @@ -830,7 +830,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { } struct Scheduler<'a> { - skel: AtroposSkel<'a>, + skel: RustySkel<'a>, struct_ops: Option, sched_interval: Duration, @@ -854,16 +854,16 @@ struct Scheduler<'a> { impl<'a> Scheduler<'a> { fn init(opts: &Opts) -> Result { // Open the BPF prog first for verification. - let mut skel_builder = AtroposSkelBuilder::default(); + let mut skel_builder = RustySkelBuilder::default(); skel_builder.obj_builder.debug(opts.verbose > 0); let mut skel = skel_builder.open().context("Failed to open BPF program")?; let nr_cpus = libbpf_rs::num_possible_cpus().unwrap(); - if nr_cpus > atropos_sys::MAX_CPUS as usize { + if nr_cpus > rusty_sys::MAX_CPUS as usize { bail!( "nr_cpus ({}) is greater than MAX_CPUS ({})", nr_cpus, - atropos_sys::MAX_CPUS + rusty_sys::MAX_CPUS ); } @@ -905,11 +905,11 @@ impl<'a> Scheduler<'a> { skel.attach().context("Failed to attach BPF program")?; let struct_ops = Some( skel.maps_mut() - .atropos() + .rusty() .attach_struct_ops() - .context("Failed to attach atropos struct ops")?, + .context("Failed to attach rusty struct ops")?, ); - info!("Atropos Scheduler Attached"); + info!("Rusty Scheduler Attached"); // Other stuff. let proc_reader = procfs::ProcReader::new(); @@ -996,7 +996,7 @@ impl<'a> Scheduler<'a> { let mut stats: Vec = Vec::new(); let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.top.nr_cpus]; - for stat in 0..atropos_sys::stat_idx_ATROPOS_NR_STATS { + for stat in 0..rusty_sys::stat_idx_RUSTY_NR_STATS { let cpu_stat_vec = stats_map .lookup_percpu(&stat.to_ne_bytes(), libbpf_rs::MapFlags::ANY) .with_context(|| format!("Failed to lookup stat {}", stat))? 
@@ -1033,22 +1033,22 @@ impl<'a> Scheduler<'a> { imbal: &[f64], ) { let stat = |idx| stats[idx as usize]; - let total = stat(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY_IDLE) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_PINNED) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY_FAR) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH) - + stat(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY); + let total = stat(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC) + + stat(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE) + + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE) + + stat(rusty_sys::stat_idx_RUSTY_STAT_PINNED) + + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH) + + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY) + + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR) + + stat(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH) + + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY); info!( "cpu={:7.2} bal={} load_avg={:8.2} task_err={} lb_data_err={} proc={:?}ms", cpu_busy * 100.0, - stats[atropos_sys::stat_idx_ATROPOS_STAT_LOAD_BALANCE as usize], + stats[rusty_sys::stat_idx_RUSTY_STAT_LOAD_BALANCE as usize], load_avg, - stats[atropos_sys::stat_idx_ATROPOS_STAT_TASK_GET_ERR as usize], + stats[rusty_sys::stat_idx_RUSTY_STAT_TASK_GET_ERR as usize], self.nr_lb_data_errors, processing_dur.as_millis(), ); @@ -1058,25 +1058,25 @@ impl<'a> Scheduler<'a> { info!( "tot={:7} wsync={:5.2} prev_idle={:5.2} greedy_idle={:5.2} pin={:5.2}", total, - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY_IDLE), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_PINNED), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PINNED), ); info!( "dir={:5.2} dir_greedy={:5.2} dir_greedy_far={:5.2}", - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_GREEDY_FAR), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR), ); info!( "dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}", - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_GREEDY), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_KICK_GREEDY), - stat_pct(atropos_sys::stat_idx_ATROPOS_STAT_REPATRIATE), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_KICK_GREEDY), + stat_pct(rusty_sys::stat_idx_RUSTY_STAT_REPATRIATE), ); let ti = &self.skel.bss().tune_input; diff --git a/tools/sched_ext/scx_atropos/src/atropos_sys.rs b/tools/sched_ext/scx_rusty/src/rusty_sys.rs similarity index 83% rename from tools/sched_ext/scx_atropos/src/atropos_sys.rs rename to tools/sched_ext/scx_rusty/src/rusty_sys.rs index bbeaf856d40e8..e948d81e7356e 100644 --- a/tools/sched_ext/scx_atropos/src/atropos_sys.rs +++ b/tools/sched_ext/scx_rusty/src/rusty_sys.rs @@ -7,4 +7,4 @@ #![allow(non_snake_case)] 
#![allow(dead_code)] -include!(concat!(env!("OUT_DIR"), "/atropos-sys.rs")); +include!(concat!(env!("OUT_DIR"), "/rusty_sys.rs")); From 6294bf394de4f22cea9a439a1aee71e167f8841e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 31 Jul 2023 16:12:46 -0500 Subject: [PATCH 090/304] scx: Update kfree_rcu() to take rcu argument In commit 7e3f926bf453 ("rcu/kvfree: Eliminate k[v]free_rcu() single argument macro"), kfree_rcu() was updated to take the rcu head in addition to just the name of the container. Let's update our usage of kfree_rcu() to match the new one. Reported-by: Andrea Righi Signed-off-by: David Vernet --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0db6d400d340e..051c79fa25f7a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2704,7 +2704,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) struct scx_dispatch_q *dsq, *tmp_dsq; llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) - kfree_rcu(dsq); + kfree_rcu(dsq, rcu); } static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); From bb578c2eb7a56aca119794efa6fc56be4f82bac6 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 31 Jul 2023 16:15:19 -0500 Subject: [PATCH 091/304] scx: List bpf_cpumask_any_*_distribute() kfuncs in header The scx_common.bpf.h header lists some kfunc definitions that schedulers can use. Two of them are bpf_cpumask_any() and bpf_cpumask_any_and(). Those were recently replaced by bpf_cpumask_any_distribute() and bpf_cpumask_any_and_distribute(), so let's update the header file accordingly. Signed-off-by: David Vernet --- tools/sched_ext/scx_common.bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 5a6136dceb4d4..06cd1892af3b3 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -179,8 +179,9 @@ bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; -u32 bpf_cpumask_any(const struct cpumask *cpumask) __ksym; -u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; /* rcu */ void bpf_rcu_read_lock(void) __ksym; From d04dbc0a79917a9c2bdbac14b2baf0850697d4ee Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 1 Aug 2023 13:57:18 -0500 Subject: [PATCH 092/304] scx: Fix README entry about rusty Atropos is now Rusty, update the documentation Signed-off-by: David Vernet --- tools/sched_ext/README | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/sched_ext/README b/tools/sched_ext/README index 33f413f8a4034..b17014aa96a4a 100644 --- a/tools/sched_ext/README +++ b/tools/sched_ext/README @@ -42,7 +42,7 @@ you want to compile the benchmarks. 2. rustup nightly -Atropos's user space load balancing component is written in Rust, and uses +Rusty's user space load balancing component is written in Rust, and uses nightly features. You'll need to use the nightly build from rustup in order to compile it. @@ -67,8 +67,8 @@ schedulers. 
-------------------------------------------------------------------------------- -Atropos -------- +Rusty +----- Overview ~~~~~~~~ @@ -81,16 +81,17 @@ how tasks should be load balanced accordingly. Typical Use Case ~~~~~~~~~~~~~~~~ -Atropos is designed to be flexible, and accommodate different architectures and +Rusty is designed to be flexible, and accommodate different architectures and workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), -as well as how Atropos should partition the system into scheduling domains, can +as well as how Rusty should partition the system into scheduling domains, can be tuned to achieve the optimal configuration for any given system or workload. Production Ready? ~~~~~~~~~~~~~~~~~ -Yes. If tuned correctly, Atropos should be performant across various CPU -architectures and workloads. +Yes. If tuned correctly, Rusty should be performant across various CPU +architectures and workloads. Rusty by default creates a separate scheduling +domain per-LLC, so its default configuration may be performant as well. -------------------------------------------------------------------------------- @@ -235,7 +236,7 @@ Overview ~~~~~~~~ A simple weighted vtime scheduler where all scheduling decisions take place in -user space. This is in contrast to Atropos, where load balancing lives in user +user space. This is in contrast to Rusty, where load balancing lives in user space, but scheduling decisions are still made in the kernel. Typical Use Case From 0799ae6a0c8411118622b0404e848a69f265f3c4 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 1 Aug 2023 14:03:40 -0500 Subject: [PATCH 093/304] scx: Update clang dependency to 16.0 We don't actually need clang 17.0.0. Clang 16.0.0 has 64-bit enum support, and the zero/sign extension fix. Let's update the README accordingly. Signed-off-by: David Vernet --- tools/sched_ext/README | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/sched_ext/README b/tools/sched_ext/README index b17014aa96a4a..ac326560ab2c8 100644 --- a/tools/sched_ext/README +++ b/tools/sched_ext/README @@ -30,16 +30,13 @@ There are a few toolchain dependencies for compiling the example schedulers. Toolchain dependencies ---------------------- -1. clang >= 17.0 +1. clang >= 16.0.0 The schedulers are BPF programs, and therefore must be compiled with clang. gcc is actively working on adding a BPF backend compiler as well, but are still missing some features such as BTF type tags which are necessary for using kptrs. -clang 17.0 has not yet been released, so you'll need to compile it yourself if -you want to compile the benchmarks. - 2. rustup nightly Rusty's user space load balancing component is written in Rust, and uses From 2d87e47747b4b2a9ac11383d8a92f87cc5b16fc9 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 1 Aug 2023 09:16:17 -0500 Subject: [PATCH 094/304] scx: Adjust a couple of small things in rusty.bpf.c rusty.bpf.c has a few small places where we can improve either the formatting of the code, or the logic. In rusty_select_cpu(), we declare the idle_smtmask as struct cpumask *, when it could be const. Also, when initializing the pcpu_ctx, we're using an actual for-loop instead of bpf_for. Let's just fix up these small issues. 
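For reference, a minimal sketch of the bpf_for() form (illustrative only, not from the patch; it assumes the open-coded numeric iterator is available via scx_common.bpf.h, which is how the in-tree schedulers get it, and the section name, program name and nr_cpus default are made up):

	#include "scx_common.bpf.h"

	char _license[] SEC("license") = "GPL";

	const volatile u32 nr_cpus = 1;	/* overridden by user space before load */

	SEC("syscall")
	int iterate_cpus(void *ctx)
	{
		s32 i;
		u64 visited = 0;

		/*
		 * i walks [0, nr_cpus) through the bpf_iter_num iterator,
		 * which is typically easier on the verifier than an
		 * explicitly bounded counted loop.
		 */
		bpf_for(i, 0, nr_cpus)
			visited++;

		bpf_printk("visited %llu CPUs", visited);
		return 0;
	}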
Signed-off-by: David Vernet --- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 2c68b659cacd1..59adec80ed29d 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -273,7 +273,7 @@ static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { - struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); + const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); struct task_ctx *task_ctx; struct bpf_cpumask *p_cpumask; pid_t pid = p->pid; @@ -933,7 +933,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) return ret; } - for (u32 i = 0; i < nr_cpus; i++) + bpf_for(i, 0, nr_cpus) pcpu_ctx[i].dom_rr_cur = i; cpumask = bpf_cpumask_create(); From 4c528368233229e44da0c6326e036c97a48ad294 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 2 Aug 2023 06:31:32 -0700 Subject: [PATCH 095/304] scx: Rename "type" -> "exit_type" When used from a bpf scheduler that is launched via libbpf-rs this naming runs into issues because "type" is a reserved keyword in Rust. Signed-off-by: Dan Schatzberg --- tools/sched_ext/user_exit_info.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/user_exit_info.h b/tools/sched_ext/user_exit_info.h index e701ef0e0b86c..9bb0b46480e78 100644 --- a/tools/sched_ext/user_exit_info.h +++ b/tools/sched_ext/user_exit_info.h @@ -11,7 +11,7 @@ #define __USER_EXIT_INFO_H struct user_exit_info { - int type; + int exit_type; char reason[128]; char msg[1024]; }; @@ -27,7 +27,7 @@ static inline void uei_record(struct user_exit_info *uei, bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); /* use __sync to force memory barrier */ - __sync_val_compare_and_swap(&uei->type, uei->type, ei->type); + __sync_val_compare_and_swap(&uei->exit_type, uei->exit_type, ei->type); } #else /* !__bpf__ */ @@ -35,7 +35,7 @@ static inline void uei_record(struct user_exit_info *uei, static inline bool uei_exited(struct user_exit_info *uei) { /* use __sync to force memory barrier */ - return __sync_val_compare_and_swap(&uei->type, -1, -1); + return __sync_val_compare_and_swap(&uei->exit_type, -1, -1); } static inline void uei_print(const struct user_exit_info *uei) From 8b8596e436fa3bd487a56306b7e837b056439fc5 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 3 Aug 2023 14:50:50 -0500 Subject: [PATCH 096/304] scx: Make cpumask arg to ops.set_cpumask() const The struct cpumask * argument to the ops.set_cpumask() op isn't const. It doesn't really matter in terms of mutability in a BPF program, but let's make it const just because it really is. Signed-off-by: David Vernet --- include/linux/sched/ext.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 26537b2f6c95c..1ba6d3690242c 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -352,7 +352,8 @@ struct sched_ext_ops { * * Update @p's CPU affinity to @cpumask. 
*/ - void (*set_cpumask)(struct task_struct *p, struct cpumask *cpumask); + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); /** * update_idle - Update the idle state of a CPU From f0fd99d42e0945d9183ae0745f8a183de8746cca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 22:33:15 -1000 Subject: [PATCH 097/304] scx: Use unsigned long for rq->scx.pnt_seq instead of u64 Andrea Righi reports that smp_load_acquire() can't be used on u64's on some 32bit architectures. pnt_seq is used to close a very short race window and 32bit should be more than enough. Use unsigned long instead of u64. --- kernel/sched/ext.c | 4 ++-- kernel/sched/sched.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 051c79fa25f7a..3e224277af777 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -129,7 +129,7 @@ static struct { #endif /* CONFIG_SMP */ /* for %SCX_KICK_WAIT */ -static u64 __percpu *scx_kick_cpus_pnt_seqs; +static unsigned long __percpu *scx_kick_cpus_pnt_seqs; /* * Direct dispatch marker. @@ -3634,7 +3634,7 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); - u64 *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); int this_cpu = cpu_of(this_rq); int cpu; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e1b436724d5d9..d2e827c98c8d3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -706,7 +706,7 @@ struct scx_rq { cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; - u64 pnt_seq; + unsigned long pnt_seq; struct irq_work kick_cpus_irq_work; }; #endif /* CONFIG_SCHED_CLASS_EXT */ From f2625bfab65e96d19921301d78eaff2523221614 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 8 Aug 2023 17:10:41 -0500 Subject: [PATCH 098/304] scx: Allow calling some kfuncs from tracepoints Some of the sched_ext kfuncs are fine to call from tracepoints. For example, we may want to call scx_bpf_error_bstr() if some error condition is detected in a tracepoint rather than a sched_ext ops callback. This patch therefore separates the scx_kfunc_ids_any kfunc BTF set into two sets: one of which includes kfuncs that can only be called from struct_ops, and the other which can be called from both struct_ops and tracepoint progs. 
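As a hypothetical sketch of what this enables (the tracepoint, program name and sanity check are all illustrative, not from the patch): a plain tracing program can now abort the loaded scheduler through scx_bpf_error(), which wraps scx_bpf_error_bstr(), rather than that only being possible from a struct_ops callback.

	#include "scx_common.bpf.h"

	char _license[] SEC("license") = "GPL";

	SEC("tp_btf/sched_wakeup")
	int BPF_PROG(check_wakeup, struct task_struct *p)
	{
		/* arbitrary sanity check, purely for illustration */
		if (p->scx.dsq_vtime == (u64)-1)
			scx_bpf_error("task %d has bogus vtime", p->pid);
		return 0;
	}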
Signed-off-by: David Vernet --- kernel/sched/ext.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 051c79fa25f7a..ed6f1e6fc829a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4365,17 +4365,25 @@ struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) } #endif -BTF_SET8_START(scx_kfunc_ids_any) +BTF_SET8_START(scx_kfunc_ids_ops_only) BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_SET8_END(scx_kfunc_ids_ops_only) + +static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_ops_only, +}; + +BTF_SET8_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) #ifdef CONFIG_CGROUP_SCHED @@ -4417,6 +4425,10 @@ static int __init register_ext_kfuncs(void) (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cpu_release)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_ops_only)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_any))) { pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret); return ret; From 1d0078547ab51032da7c88b8a054a115716e318f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Aug 2023 12:49:45 -1000 Subject: [PATCH 099/304] scx: Use atomic_long_t for scx_nr_rejected instead of atomic64_t atomic64_t can be pretty inefficient in 32bit archs and the counter being 32bit on 32bit arch is fine. Let's use atomic_long_t instead. 
--- kernel/sched/ext.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3e224277af777..bf4fda692ae54 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -94,7 +94,7 @@ struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = static atomic_t scx_exit_type = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info scx_exit_info; -static atomic64_t scx_nr_rejected = ATOMIC64_INIT(0); +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without @@ -2248,7 +2248,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) */ if (p->policy == SCHED_EXT) { p->policy = SCHED_NORMAL; - atomic64_inc(&scx_nr_rejected); + atomic_long_inc(&scx_nr_rejected); } task_rq_unlock(rq, p, &rf); @@ -3200,7 +3200,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) atomic_set(&scx_exit_type, SCX_EXIT_NONE); scx_warned_zero_slice = false; - atomic64_set(&scx_nr_rejected, 0); + atomic_long_set(&scx_nr_rejected, 0); /* * Keep CPUs stable during enable so that the BPF scheduler can track @@ -3414,8 +3414,8 @@ static int scx_debug_show(struct seq_file *m, void *v) seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all()); seq_printf(m, "%-30s: %s\n", "enable_state", scx_ops_enable_state_str[scx_ops_enable_state()]); - seq_printf(m, "%-30s: %llu\n", "nr_rejected", - atomic64_read(&scx_nr_rejected)); + seq_printf(m, "%-30s: %lu\n", "nr_rejected", + atomic_long_read(&scx_nr_rejected)); mutex_unlock(&scx_ops_enable_mutex); return 0; } From e453cbb528344b862ca172255573d231dc961c97 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Aug 2023 13:22:35 -1000 Subject: [PATCH 100/304] scx: Make p->scx.ops_state atomic_long_t instead of atomic64_t Some 32bit archs can't do 64bit store_release/load_acquire. Use atomic_long_t instead. --- include/linux/sched/ext.h | 2 +- kernel/sched/ext.c | 55 +++++++++++++++++++++++---------------- kernel/sched/sched.h | 2 +- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ba6d3690242c..8a2d8eaefd33f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -651,7 +651,7 @@ struct sched_ext_entity { s32 holding_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ - atomic64_t ops_state; + atomic_long_t ops_state; unsigned long runnable_at; #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bf4fda692ae54..1e4a8957855ec 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -55,12 +55,19 @@ enum scx_ops_state { * QSEQ brands each QUEUED instance so that, when dispatch races * dequeue/requeue, the dispatcher can tell whether it still has a claim * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. 
*/ SCX_OPSS_QSEQ_SHIFT = 2, - SCX_OPSS_STATE_MASK = (1LLU << SCX_OPSS_QSEQ_SHIFT) - 1, - SCX_OPSS_QSEQ_MASK = ~SCX_OPSS_STATE_MASK, }; +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + /* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to @@ -155,7 +162,7 @@ static LLIST_HEAD(dsqs_to_free); /* dispatch buf */ struct scx_dsp_buf_ent { struct task_struct *task; - u64 qseq; + unsigned long qseq; u64 dsq_id; u64 enq_flags; }; @@ -491,11 +498,11 @@ static bool scx_ops_disabling(void) * has load_acquire semantics to ensure that the caller can see the updates made * in the enqueueing and dispatching paths. */ -static void wait_ops_state(struct task_struct *p, u64 opss) +static void wait_ops_state(struct task_struct *p, unsigned long opss) { do { cpu_relax(); - } while (atomic64_read_acquire(&p->scx.ops_state) == opss); + } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); } /** @@ -645,7 +652,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, * match waiters' load_acquire. */ if (enq_flags & SCX_ENQ_CLEAR_OPSS) - atomic64_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); if (is_local) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); @@ -811,7 +818,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { struct task_struct **ddsp_taskp; - u64 qseq; + unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); @@ -852,8 +859,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; - WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); - atomic64_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); WARN_ON_ONCE(*ddsp_taskp); @@ -866,7 +873,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, * dequeue may be waiting. The store_release matches their load_acquire. 
*/ if (*ddsp_taskp == p) - atomic64_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); *ddsp_taskp = NULL; return; @@ -948,12 +955,12 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags static void ops_dequeue(struct task_struct *p, u64 deq_flags) { - u64 opss; + unsigned long opss; watchdog_unwatch_task(p, false); /* acquire ensures that we see the preceding updates on QUEUED */ - opss = atomic64_read_acquire(&p->scx.ops_state); + opss = atomic_long_read_acquire(&p->scx.ops_state); switch (opss & SCX_OPSS_STATE_MASK) { case SCX_OPSS_NONE: @@ -968,8 +975,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) if (SCX_HAS_OP(dequeue)) SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); - if (atomic64_try_cmpxchg(&p->scx.ops_state, &opss, - SCX_OPSS_NONE)) + if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_NONE)) break; fallthrough; case SCX_OPSS_DISPATCHING: @@ -987,7 +994,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. */ wait_ops_state(p, SCX_OPSS_DISPATCHING); - BUG_ON(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); break; } } @@ -1338,7 +1345,7 @@ dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, p->scx.holding_cpu = raw_smp_processor_id(); /* store_release ensures that dequeue sees the above */ - atomic64_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); @@ -1406,11 +1413,12 @@ dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, * BPF scheduler and claim the ownership before dispatching. */ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, - struct task_struct *p, u64 qseq_at_dispatch, + struct task_struct *p, + unsigned long qseq_at_dispatch, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *dsq; - u64 opss; + unsigned long opss; touch_core_sched_dispatch(rq, p); retry: @@ -1418,7 +1426,7 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, * No need for _acquire here. @p is accessed only after a successful * try_cmpxchg to DISPATCHING. */ - opss = atomic64_read(&p->scx.ops_state); + opss = atomic_long_read(&p->scx.ops_state); switch (opss & SCX_OPSS_STATE_MASK) { case SCX_OPSS_DISPATCHING: @@ -1441,8 +1449,8 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, * claim @p by atomically transitioning it from QUEUED to * DISPATCHING. 
*/ - if (likely(atomic64_try_cmpxchg(&p->scx.ops_state, &opss, - SCX_OPSS_DISPATCHING))) + if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_DISPATCHING))) break; goto retry; case SCX_OPSS_QUEUEING: @@ -3819,7 +3827,7 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){ .task = p, - .qseq = atomic64_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, + .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, .dsq_id = dsq_id, .enq_flags = enq_flags, }; @@ -4011,7 +4019,8 @@ u32 scx_bpf_reenqueue_local(void) struct task_struct *p; p = first_local_task(rq); - WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != + SCX_OPSS_NONE); WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); WARN_ON_ONCE(p->scx.holding_cpu != -1); dispatch_dequeue(scx_rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2e827c98c8d3..4183587f9541f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -698,7 +698,7 @@ enum scx_rq_flags { struct scx_rq { struct scx_dispatch_q local_dsq; struct list_head watchdog_list; - u64 ops_qseq; + unsigned long ops_qseq; u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 flags; From 845aec954e4afc354957493750c26b3e994e4465 Mon Sep 17 00:00:00 2001 From: inwardvessel <5782523+inwardvessel@users.noreply.github.com> Date: Thu, 15 Jun 2023 15:31:44 -0700 Subject: [PATCH 101/304] use resizing of datasec maps in examples --- tools/sched_ext/scx_central.bpf.c | 17 ++++++----- tools/sched_ext/scx_central.c | 4 +++ tools/sched_ext/scx_common.bpf.h | 48 +++++++++++++++++++++++++++++++ tools/sched_ext/scx_pair.bpf.c | 19 ++++++------ tools/sched_ext/scx_pair.c | 30 ++++++++++++------- tools/sched_ext/scx_pair.h | 1 - tools/sched_ext/scx_user_common.h | 23 +++++++++++++++ 7 files changed, 112 insertions(+), 30 deletions(-) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index f44b9365a1778..67e6412bd5d83 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -51,14 +51,13 @@ char _license[] SEC("license") = "GPL"; enum { FALLBACK_DSQ_ID = 0, - MAX_CPUS = 4096, MS_TO_NS = 1000LLU * 1000, TIMER_INTERVAL_NS = 1 * MS_TO_NS, }; const volatile bool switch_partial; const volatile s32 central_cpu; -const volatile u32 nr_cpu_ids = 64; /* !0 for veristat, set during init */ +const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ u64 nr_total, nr_locals, nr_queued, nr_lost_pids; u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; @@ -73,8 +72,8 @@ struct { } central_q SEC(".maps"); /* can't use percpu map due to bad lookups */ -static bool cpu_gimme_task[MAX_CPUS]; -static u64 cpu_started_at[MAX_CPUS]; +bool RESIZABLE_ARRAY(data, cpu_gimme_task); +u64 RESIZABLE_ARRAY(data, cpu_started_at); struct central_timer { struct bpf_timer timer; @@ -189,7 +188,7 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) break; /* central's gimme is never set */ - gimme = MEMBER_VPTR(cpu_gimme_task, [cpu]); + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); if (gimme && !*gimme) continue; @@ -220,7 +219,7 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) if (scx_bpf_consume(FALLBACK_DSQ_ID)) return; - gimme = MEMBER_VPTR(cpu_gimme_task, [cpu]); + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); if (gimme) *gimme = true; @@ -235,7 +234,7 @@ 
void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) void BPF_STRUCT_OPS(central_running, struct task_struct *p) { s32 cpu = scx_bpf_task_cpu(p); - u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); if (started_at) *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ } @@ -243,7 +242,7 @@ void BPF_STRUCT_OPS(central_running, struct task_struct *p) void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) { s32 cpu = scx_bpf_task_cpu(p); - u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); if (started_at) *started_at = 0; } @@ -262,7 +261,7 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) continue; /* kick iff the current one exhausted its slice */ - started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); if (started_at && *started_at && vtime_before(now, *started_at + SCX_SLICE_DFL)) continue; diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index a303401ffe1a1..580d4b50172fa 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -63,6 +63,10 @@ int main(int argc, char **argv) } } + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids); + SCX_BUG_ON(scx_central__load(skel), "Failed to load skel"); link = bpf_map__attach_struct_ops(skel->maps.central_ops); diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 06cd1892af3b3..81bfe3d041c9a 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -81,6 +81,26 @@ BPF_PROG(name, ##args) SEC("struct_ops.s/"#name) \ BPF_PROG(name, ##args) +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that length of the array is extended to meet the + * new length of the data section. This macro annotates an array to have an + * element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + /** * MEMBER_VPTR - Obtain the verified pointer to a struct or array member * @base: struct or array to index @@ -117,6 +137,34 @@ BPF_PROG(name, ##args) __addr; \ }) +/** + * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element + * @arr: array to index into + * @i: array index + * @n: number of elements in array + * + * Similar to MEMBER_VPTR() but is intended for use with arrays where the + * element count needs to be explicit. + * It can be used in cases where a global array is defined with an initial + * size but is intended to be be resized before loading the BPF program. 
+ * Without this version of the macro, MEMBER_VPTR() will use the compile time + * size of the array to compute the max, which will result in rejection by + * the verifier. + */ +#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \ + u64 __base = (u64)arr; \ + u64 __addr = (u64)&(arr[i]) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ + __addr; \ +}) + /* * BPF core and other generic helpers */ diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c index cda126980ed51..9c9cf97f4feeb 100644 --- a/tools/sched_ext/scx_pair.bpf.c +++ b/tools/sched_ext/scx_pair.bpf.c @@ -123,19 +123,19 @@ char _license[] SEC("license") = "GPL"; const volatile bool switch_partial; /* !0 for veristat, set during init */ -const volatile u32 nr_cpu_ids = 64; +const volatile u32 nr_cpu_ids = 1; /* a pair of CPUs stay on a cgroup for this duration */ const volatile u32 pair_batch_dur_ns = SCX_SLICE_DFL; /* cpu ID -> pair cpu ID */ -const volatile s32 pair_cpu[MAX_CPUS] = { [0 ... MAX_CPUS - 1] = -1 }; +const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); /* cpu ID -> pair_id */ -const volatile u32 pair_id[MAX_CPUS]; +const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); /* CPU ID -> CPU # in the pair (0 or 1) */ -const volatile u32 in_pair_idx[MAX_CPUS]; +const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); struct pair_ctx { struct bpf_spin_lock lock; @@ -161,7 +161,6 @@ struct pair_ctx { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, MAX_CPUS / 2); __type(key, u32); __type(value, struct pair_ctx); } pair_ctx SEC(".maps"); @@ -299,7 +298,7 @@ static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) { u32 *vptr; - vptr = (u32 *)MEMBER_VPTR(pair_id, [cpu]); + vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); if (!vptr) return -EINVAL; @@ -307,7 +306,7 @@ static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) if (!(*pairc)) return -EINVAL; - vptr = (u32 *)MEMBER_VPTR(in_pair_idx, [cpu]); + vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); if (!vptr) return -EINVAL; @@ -490,7 +489,7 @@ static int try_dispatch(s32 cpu) out_maybe_kick: if (kick_pair) { - s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); if (pair) { __sync_fetch_and_add(&nr_kicks, 1); scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); @@ -525,7 +524,7 @@ void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args bpf_spin_unlock(&pairc->lock); if (kick_pair) { - s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); if (pair) { __sync_fetch_and_add(&nr_kicks, 1); @@ -554,7 +553,7 @@ void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args bpf_spin_unlock(&pairc->lock); if (kick_pair) { - s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); if (pair) { __sync_fetch_and_add(&nr_kicks, 1); diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index c2de48430c5b3..9e6f3109653c2 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -67,27 +67,37 @@ int main(int argc, char **argv) } } + bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); + + /* Resize arrays so their element count is equal to cpu count. 
*/ + RESIZE_ARRAY(rodata, pair_cpu, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(rodata, pair_id, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(rodata, in_pair_idx, skel->rodata->nr_cpu_ids); + + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) + skel->rodata_pair_cpu->pair_cpu[i] = -1; + printf("Pairs: "); for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { int j = (i + stride) % skel->rodata->nr_cpu_ids; - if (skel->rodata->pair_cpu[i] >= 0) + if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) continue; SCX_BUG_ON(i == j, "Invalid stride %d - CPU%d wants to be its own pair", stride, i); - SCX_BUG_ON(skel->rodata->pair_cpu[j] >= 0, + SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", - stride, i, j, skel->rodata->pair_cpu[j]); - - skel->rodata->pair_cpu[i] = j; - skel->rodata->pair_cpu[j] = i; - skel->rodata->pair_id[i] = i; - skel->rodata->pair_id[j] = i; - skel->rodata->in_pair_idx[i] = 0; - skel->rodata->in_pair_idx[j] = 1; + stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); + + skel->rodata_pair_cpu->pair_cpu[i] = j; + skel->rodata_pair_cpu->pair_cpu[j] = i; + skel->rodata_pair_id->pair_id[i] = i; + skel->rodata_pair_id->pair_id[j] = i; + skel->rodata_in_pair_idx->in_pair_idx[i] = 0; + skel->rodata_in_pair_idx->in_pair_idx[j] = 1; printf("[%d, %d] ", i, j); } diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h index f60b824272f75..d9666a447d3fd 100644 --- a/tools/sched_ext/scx_pair.h +++ b/tools/sched_ext/scx_pair.h @@ -2,7 +2,6 @@ #define __SCX_EXAMPLE_PAIR_H enum { - MAX_CPUS = 4096, MAX_QUEUED = 4096, MAX_CGRPS = 4096, }; diff --git a/tools/sched_ext/scx_user_common.h b/tools/sched_ext/scx_user_common.h index 76a0d12eba28c..d5b7ce48cd6d7 100644 --- a/tools/sched_ext/scx_user_common.h +++ b/tools/sched_ext/scx_user_common.h @@ -31,4 +31,27 @@ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ } while (0) +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @elfsec: the data section of the BPF program in which to the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory mapped + * region. + */ +#define RESIZE_ARRAY(elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ + sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ + skel->elfsec##_##arr = \ + bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ + } while (0) + #endif /* __SCHED_EXT_USER_COMMON_H */ From cb04f5610f018828342421fdab83cb956c9ad223 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Aug 2023 11:54:08 -1000 Subject: [PATCH 102/304] scx: bpf_scx_btf_struct_access() should return -EACCES for unknown accesses The function is currently returning 0 for unknown accesses which means allowing writes to anything. Fix the default return value. 
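As a hedged reading of the callback contract (not taken from this patch): a struct_ops btf_struct_access() implementation returns the register type of the field for an access it permits and a negative errno to reject it, so the intended shape is roughly the following, where example_btf_struct_access() and scx_field_writable() are hypothetical names used only for illustration:

    /* sketch only; scx_field_writable() is a hypothetical helper */
    static int example_btf_struct_access(struct bpf_verifier_log *log,
                                         const struct bpf_reg_state *reg,
                                         int off, int size)
    {
            if (scx_field_writable(off, size))
                    return SCALAR_VALUE;    /* access permitted */

            return -EACCES;                 /* reject; 0 reads as "allowed" */
    }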
--- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0f89326b79d9e..b7a80233ea089 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3485,7 +3485,7 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, return SCALAR_VALUE; } - return 0; + return -EACCES; } static const struct bpf_func_proto * From d377f5e36709e1eb02b0785417dfb497a7fece30 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Sep 2023 13:44:09 -1000 Subject: [PATCH 103/304] debug patches and fix --- include/linux/sched/ext.h | 7 +- kernel/sched/ext.c | 8 +- noinline-everything.patch | 720 ++++++++++++++++++++++++++++++++++++++ scx-event-track.patch | 67 ++++ 4 files changed, 797 insertions(+), 5 deletions(-) create mode 100644 noinline-everything.patch create mode 100644 scx-event-track.patch diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 8a2d8eaefd33f..24f74ebeb7af1 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -598,7 +598,6 @@ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */ - SCX_TASK_ON_DSQ_PRIQ = 1 << 3, /* task is queued on the priority queue of a dsq */ SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ @@ -609,6 +608,11 @@ enum scx_ent_flags { SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ }; +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + /* * Mask bits for scx_entity.kf_mask. 
Not all kfuncs can be called from * everywhere and the following bits track which kfunc sets are currently @@ -646,6 +650,7 @@ struct sched_ext_entity { } dsq_node; struct list_head watchdog_node; u32 flags; /* protected by rq lock */ + u32 dsq_flags; /* protected by dsq lock */ u32 weight; s32 sticky_cpu; s32 holding_cpu; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index b7a80233ea089..bbad3a5cb42ab 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -620,7 +620,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, bool is_local = dsq->id == SCX_DSQ_LOCAL; WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); - WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) || + WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); if (!is_local) { @@ -635,7 +635,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } if (enq_flags & SCX_ENQ_DSQ_PRIQ) { - p->scx.flags |= SCX_TASK_ON_DSQ_PRIQ; + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); } else { @@ -675,10 +675,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, static void task_unlink_from_dsq(struct task_struct *p, struct scx_dispatch_q *dsq) { - if (p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) { + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); RB_CLEAR_NODE(&p->scx.dsq_node.priq); - p->scx.flags &= ~SCX_TASK_ON_DSQ_PRIQ; + p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; } else { list_del_init(&p->scx.dsq_node.fifo); } diff --git a/noinline-everything.patch b/noinline-everything.patch new file mode 100644 index 0000000000000..882349fd77071 --- /dev/null +++ b/noinline-everything.patch @@ -0,0 +1,720 @@ +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index b7a80233ea08..b9f6e50aaafe 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -814,8 +814,8 @@ static bool test_rq_online(struct rq *rq) + #endif + } + +-static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, +- int sticky_cpu) ++static noinline void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, ++ int sticky_cpu) + { + struct task_struct **ddsp_taskp; + unsigned long qseq; +@@ -895,12 +895,12 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + dispatch_enqueue(&scx_dsq_global, p, enq_flags); + } + +-static bool watchdog_task_watched(const struct task_struct *p) ++static noinline bool watchdog_task_watched(const struct task_struct *p) + { + return !list_empty(&p->scx.watchdog_node); + } + +-static void watchdog_watch_task(struct rq *rq, struct task_struct *p) ++static noinline void watchdog_watch_task(struct rq *rq, struct task_struct *p) + { + lockdep_assert_rq_held(rq); + if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) +@@ -909,14 +909,14 @@ static void watchdog_watch_task(struct rq *rq, struct task_struct *p) + list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list); + } + +-static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) ++static noinline void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) + { + list_del_init(&p->scx.watchdog_node); + if (reset_timeout) + p->scx.flags |= SCX_TASK_WATCHDOG_RESET; + } + +-static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) ++static noinline void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) + { 
+ int sticky_cpu = p->scx.sticky_cpu; + +@@ -953,7 +953,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags + do_enqueue_task(rq, p, enq_flags, sticky_cpu); + } + +-static void ops_dequeue(struct task_struct *p, u64 deq_flags) ++static noinline void ops_dequeue(struct task_struct *p, u64 deq_flags) + { + unsigned long opss; + +@@ -999,7 +999,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) + } + } + +-static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) ++static noinline void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) + { + struct scx_rq *scx_rq = &rq->scx; + +@@ -1042,7 +1042,7 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags + dispatch_dequeue(scx_rq, p); + } + +-static void yield_task_scx(struct rq *rq) ++static noinline void yield_task_scx(struct rq *rq) + { + struct task_struct *p = rq->curr; + +@@ -1052,7 +1052,7 @@ static void yield_task_scx(struct rq *rq) + p->scx.slice = 0; + } + +-static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) ++static noinline bool yield_to_task_scx(struct rq *rq, struct task_struct *to) + { + struct task_struct *from = rq->curr; + +@@ -1087,8 +1087,8 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) + * Returns %true if @p was successfully moved. %false after racing dequeue and + * losing. + */ +-static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, +- u64 enq_flags) ++static noinline bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, ++ u64 enq_flags) + { + struct rq *task_rq; + +@@ -1142,8 +1142,8 @@ static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, + * @rq stays locked isn't important as long as the state is restored after + * dispatch_to_local_dsq_unlock(). + */ +-static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, +- struct rq *src_rq, struct rq *dst_rq) ++static noinline void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, ++ struct rq *src_rq, struct rq *dst_rq) + { + rq_unpin_lock(rq, rf); + +@@ -1171,8 +1171,8 @@ static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, + * + * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. + */ +-static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, +- struct rq *src_rq, struct rq *dst_rq) ++static noinline void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, ++ struct rq *src_rq, struct rq *dst_rq) + { + if (src_rq == dst_rq) { + raw_spin_rq_unlock(dst_rq); +@@ -1191,14 +1191,14 @@ static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, + #endif /* CONFIG_SMP */ + + +-static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) ++static noinline bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) + { + return likely(test_rq_online(rq)) && !is_migration_disabled(p) && + cpumask_test_cpu(cpu_of(rq), p->cpus_ptr); + } + +-static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, +- struct scx_dispatch_q *dsq) ++static noinline bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, ++ struct scx_dispatch_q *dsq) + { + struct scx_rq *scx_rq = &rq->scx; + struct task_struct *p; +@@ -1293,7 +1293,7 @@ enum dispatch_to_local_dsq_ret { + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). 
+ */ +-static enum dispatch_to_local_dsq_ret ++static noinline enum dispatch_to_local_dsq_ret + dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, + struct task_struct *p, u64 enq_flags) + { +@@ -1412,7 +1412,7 @@ dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, + * was valid in the first place. Make sure that the task is still owned by the + * BPF scheduler and claim the ownership before dispatching. + */ +-static void finish_dispatch(struct rq *rq, struct rq_flags *rf, ++static noinline void finish_dispatch(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, + unsigned long qseq_at_dispatch, + u64 dsq_id, u64 enq_flags) +@@ -1482,7 +1482,7 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, + } + } + +-static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) ++static noinline void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) + { + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + u32 u; +@@ -1498,7 +1498,7 @@ static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) + dspc->buf_cursor = 0; + } + +-static int balance_one(struct rq *rq, struct task_struct *prev, ++static noinline int balance_one(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf, bool local) + { + struct scx_rq *scx_rq = &rq->scx; +@@ -1600,7 +1600,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, + return 0; + } + +-static int balance_scx(struct rq *rq, struct task_struct *prev, ++static noinline int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) + { + int ret; +@@ -1642,7 +1642,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, + return ret; + } + +-static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) ++static noinline void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) + { + if (p->scx.flags & SCX_TASK_QUEUED) { + /* +@@ -1676,7 +1676,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) + } + } + +-static void put_prev_task_scx(struct rq *rq, struct task_struct *p) ++static noinline void put_prev_task_scx(struct rq *rq, struct task_struct *p) + { + #ifndef CONFIG_SMP + /* +@@ -1756,7 +1756,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) + } + } + +-static struct task_struct *first_local_task(struct rq *rq) ++static noinline struct task_struct *first_local_task(struct rq *rq) + { + struct rb_node *rb_node; + +@@ -1772,7 +1772,7 @@ static struct task_struct *first_local_task(struct rq *rq) + return NULL; + } + +-static struct task_struct *pick_next_task_scx(struct rq *rq) ++static noinline struct task_struct *pick_next_task_scx(struct rq *rq) + { + struct task_struct *p; + +@@ -1846,7 +1846,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + * at the first task in the local dsq. @rq->curr has to be considered explicitly + * to mimic %SCX_TASK_BAL_KEEP. 
+ */ +-static struct task_struct *pick_task_scx(struct rq *rq) ++static noinline struct task_struct *pick_task_scx(struct rq *rq) + { + struct task_struct *curr = rq->curr; + struct task_struct *first = first_local_task(rq); +@@ -1878,7 +1878,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) + } + #endif /* CONFIG_SCHED_CORE */ + +-static enum scx_cpu_preempt_reason ++static noinline enum scx_cpu_preempt_reason + preempt_reason_from_class(const struct sched_class *class) + { + #ifdef CONFIG_SMP +@@ -1932,7 +1932,7 @@ void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, + + #ifdef CONFIG_SMP + +-static bool test_and_clear_cpu_idle(int cpu) ++static noinline bool test_and_clear_cpu_idle(int cpu) + { + #ifdef CONFIG_SCHED_SMT + /* +@@ -1958,7 +1958,7 @@ static bool test_and_clear_cpu_idle(int cpu) + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); + } + +-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) ++static noinline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) + { + int cpu; + +@@ -1983,7 +1983,7 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) + goto retry; + } + +-static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) ++static noinline s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) + { + s32 cpu; + +@@ -2040,7 +2040,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag + return prev_cpu; + } + +-static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) ++static noinline int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) + { + if (SCX_HAS_OP(select_cpu)) { + s32 cpu; +@@ -2058,7 +2058,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag + } + } + +-static void set_cpus_allowed_scx(struct task_struct *p, ++static noinline void set_cpus_allowed_scx(struct task_struct *p, + struct affinity_context *ac) + { + set_cpus_allowed_common(p, ac); +@@ -2076,7 +2076,7 @@ static void set_cpus_allowed_scx(struct task_struct *p, + (struct cpumask *)p->cpus_ptr); + } + +-static void reset_idle_masks(void) ++static noinline void reset_idle_masks(void) + { + /* consider all cpus idle, should converge to the actual state quickly */ + cpumask_setall(idle_masks.cpu); +@@ -2119,13 +2119,13 @@ void __scx_update_idle(struct rq *rq, bool idle) + #endif + } + +-static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) ++static noinline void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) + { + if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); + } + +-static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) ++static noinline void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) + { + if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); +@@ -2133,13 +2133,13 @@ static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) + + #else /* !CONFIG_SMP */ + +-static bool test_and_clear_cpu_idle(int cpu) { return false; } +-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } +-static void reset_idle_masks(void) {} ++static noinline bool test_and_clear_cpu_idle(int cpu) { return false; } ++static noinline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } ++static noinline void 
reset_idle_masks(void) {} + + #endif /* CONFIG_SMP */ + +-static bool check_rq_for_timeouts(struct rq *rq) ++static noinline bool check_rq_for_timeouts(struct rq *rq) + { + struct task_struct *p; + struct rq_flags rf; +@@ -2166,7 +2166,7 @@ static bool check_rq_for_timeouts(struct rq *rq) + return timed_out; + } + +-static void scx_watchdog_workfn(struct work_struct *work) ++static noinline void scx_watchdog_workfn(struct work_struct *work) + { + int cpu; + +@@ -2182,7 +2182,7 @@ static void scx_watchdog_workfn(struct work_struct *work) + scx_watchdog_timeout / 2); + } + +-static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) ++static noinline void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) + { + update_curr_scx(rq); + +@@ -2200,7 +2200,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) + } + + #ifdef CONFIG_EXT_GROUP_SCHED +-static struct cgroup *tg_cgrp(struct task_group *tg) ++static noinline struct cgroup *tg_cgrp(struct task_group *tg) + { + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, +@@ -2221,7 +2221,7 @@ static struct cgroup *tg_cgrp(struct task_group *tg) + + #endif /* CONFIG_EXT_GROUP_SCHED */ + +-static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) ++static noinline int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) + { + int ret; + +@@ -2266,7 +2266,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) + return 0; + } + +-static void scx_ops_enable_task(struct task_struct *p) ++static noinline void scx_ops_enable_task(struct task_struct *p) + { + lockdep_assert_rq_held(task_rq(p)); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); +@@ -2281,7 +2281,7 @@ static void scx_ops_enable_task(struct task_struct *p) + p->scx.flags |= SCX_TASK_OPS_ENABLED; + } + +-static void scx_ops_disable_task(struct task_struct *p) ++static noinline void scx_ops_disable_task(struct task_struct *p) + { + lockdep_assert_rq_held(task_rq(p)); + +@@ -2300,7 +2300,7 @@ static void scx_ops_disable_task(struct task_struct *p) + } + } + +-static void set_task_scx_weight(struct task_struct *p) ++static noinline void set_task_scx_weight(struct task_struct *p) + { + u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + +@@ -2317,7 +2317,7 @@ static void set_task_scx_weight(struct task_struct *p) + * created, priority is changed for a task on sched_ext, and a task is switched + * to sched_ext from other classes. 
+ */ +-static void refresh_scx_weight(struct task_struct *p) ++static noinline void refresh_scx_weight(struct task_struct *p) + { + lockdep_assert_rq_held(task_rq(p)); + set_task_scx_weight(p); +@@ -2402,16 +2402,16 @@ void sched_ext_free(struct task_struct *p) + } + } + +-static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) ++static noinline void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) + { + refresh_scx_weight(p); + } + +-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) ++static noinline void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) + { + } + +-static void switching_to_scx(struct rq *rq, struct task_struct *p) ++static noinline void switching_to_scx(struct rq *rq, struct task_struct *p) + { + refresh_scx_weight(p); + +@@ -2424,8 +2424,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) + (struct cpumask *)p->cpus_ptr); + } + +-static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +-static void switched_to_scx(struct rq *rq, struct task_struct *p) {} ++static noinline void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} ++static noinline void switched_to_scx(struct rq *rq, struct task_struct *p) {} + + int scx_check_setscheduler(struct task_struct *p, int policy) + { +@@ -2602,12 +2602,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) + percpu_up_read(&scx_cgroup_rwsem); + } + +-static void scx_cgroup_lock(void) ++static noinline void scx_cgroup_lock(void) + { + percpu_down_write(&scx_cgroup_rwsem); + } + +-static void scx_cgroup_unlock(void) ++static noinline void scx_cgroup_unlock(void) + { + percpu_up_write(&scx_cgroup_rwsem); + } +@@ -2674,7 +2674,7 @@ DEFINE_SCHED_CLASS(ext) = { + #endif + }; + +-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) ++static noinline void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) + { + memset(dsq, 0, sizeof(*dsq)); + +@@ -2683,7 +2683,7 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) + dsq->id = dsq_id; + } + +-static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) ++static noinline struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) + { + struct scx_dispatch_q *dsq; + int ret; +@@ -2706,7 +2706,7 @@ static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) + return dsq; + } + +-static void free_dsq_irq_workfn(struct irq_work *irq_work) ++static noinline void free_dsq_irq_workfn(struct irq_work *irq_work) + { + struct llist_node *to_free = llist_del_all(&dsqs_to_free); + struct scx_dispatch_q *dsq, *tmp_dsq; +@@ -2717,7 +2717,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) + + static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); + +-static void destroy_dsq(u64 dsq_id) ++static noinline void destroy_dsq(u64 dsq_id) + { + struct scx_dispatch_q *dsq; + unsigned long flags; +@@ -2756,7 +2756,7 @@ static void destroy_dsq(u64 dsq_id) + } + + #ifdef CONFIG_EXT_GROUP_SCHED +-static void scx_cgroup_exit(void) ++static noinline void scx_cgroup_exit(void) + { + struct cgroup_subsys_state *css; + +@@ -2789,7 +2789,7 @@ static void scx_cgroup_exit(void) + rcu_read_unlock(); + } + +-static int scx_cgroup_init(void) ++static noinline int scx_cgroup_init(void) + { + struct cgroup_subsys_state *css; + int ret; +@@ -2834,7 +2834,7 @@ static int scx_cgroup_init(void) + return 0; + } + +-static void scx_cgroup_config_knobs(void) ++static noinline void 
scx_cgroup_config_knobs(void) + { + static DEFINE_MUTEX(cgintf_mutex); + DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; +@@ -2875,9 +2875,9 @@ static void scx_cgroup_config_knobs(void) + } + + #else +-static void scx_cgroup_exit(void) {} +-static int scx_cgroup_init(void) { return 0; } +-static void scx_cgroup_config_knobs(void) {} ++static noinline void scx_cgroup_exit(void) {} ++static noinline int scx_cgroup_init(void) { return 0; } ++static noinline void scx_cgroup_config_knobs(void) {} + #endif + + /* +@@ -2893,7 +2893,7 @@ bool task_should_scx(struct task_struct *p) + return p->policy == SCHED_EXT; + } + +-static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) ++static noinline void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) + { + if (enq_flags & SCX_ENQ_LAST) + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); +@@ -2901,9 +2901,9 @@ static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + +-static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} ++static noinline void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} + +-static void scx_ops_disable_workfn(struct kthread_work *work) ++static noinline void scx_ops_disable_workfn(struct kthread_work *work) + { + struct scx_exit_info *ei = &scx_exit_info; + struct scx_task_iter sti; +@@ -3113,7 +3113,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) + + static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); + +-static void schedule_scx_ops_disable_work(void) ++static noinline void schedule_scx_ops_disable_work(void) + { + struct kthread_worker *helper = READ_ONCE(scx_ops_helper); + +@@ -3125,7 +3125,7 @@ static void schedule_scx_ops_disable_work(void) + kthread_queue_work(helper, &scx_ops_disable_work); + } + +-static void scx_ops_disable(enum scx_exit_type type) ++static noinline void scx_ops_disable(enum scx_exit_type type) + { + int none = SCX_EXIT_NONE; + +@@ -3137,7 +3137,7 @@ static void scx_ops_disable(enum scx_exit_type type) + schedule_scx_ops_disable_work(); + } + +-static void scx_ops_error_irq_workfn(struct irq_work *irq_work) ++static noinline void scx_ops_error_irq_workfn(struct irq_work *irq_work) + { + schedule_scx_ops_disable_work(); + } +@@ -3163,7 +3163,7 @@ __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, + irq_work_queue(&scx_ops_error_irq_work); + } + +-static struct kthread_worker *scx_create_rt_helper(const char *name) ++static noinline struct kthread_worker *scx_create_rt_helper(const char *name) + { + struct kthread_worker *helper; + +@@ -3173,7 +3173,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) + return helper; + } + +-static int scx_ops_enable(struct sched_ext_ops *ops) ++static noinline int scx_ops_enable(struct sched_ext_ops *ops) + { + struct scx_task_iter sti; + struct task_struct *p; +@@ -3412,7 +3412,7 @@ static const char *scx_ops_enable_state_str[] = { + [SCX_OPS_DISABLED] = "disabled", + }; + +-static int scx_debug_show(struct seq_file *m, void *v) ++static noinline int scx_debug_show(struct seq_file *m, void *v) + { + mutex_lock(&scx_ops_enable_mutex); + seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); +@@ -3428,7 +3428,7 @@ static int scx_debug_show(struct seq_file *m, void *v) + return 0; + } + +-static int scx_debug_open(struct inode *inode, struct file *file) ++static noinline int scx_debug_open(struct inode *inode, struct file *file) + { + 
return single_open(file, scx_debug_show, NULL); + } +@@ -3451,7 +3451,7 @@ const struct file_operations sched_ext_fops = { + extern struct btf *btf_vmlinux; + static const struct btf_type *task_struct_type; + +-static bool bpf_scx_is_valid_access(int off, int size, ++static noinline bool bpf_scx_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +@@ -3466,7 +3466,7 @@ static bool bpf_scx_is_valid_access(int off, int size, + return btf_ctx_access(off, size, type, prog, info); + } + +-static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, ++static noinline int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) + { +@@ -3488,7 +3488,7 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + return -EACCES; + } + +-static const struct bpf_func_proto * ++static noinline const struct bpf_func_proto * + bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) + { + switch (func_id) { +@@ -3507,7 +3507,7 @@ const struct bpf_verifier_ops bpf_scx_verifier_ops = { + .btf_struct_access = bpf_scx_btf_struct_access, + }; + +-static int bpf_scx_init_member(const struct btf_type *t, ++static noinline int bpf_scx_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) + { +@@ -3545,7 +3545,7 @@ static int bpf_scx_init_member(const struct btf_type *t, + return 0; + } + +-static int bpf_scx_check_member(const struct btf_type *t, ++static noinline int bpf_scx_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) + { +@@ -3569,18 +3569,18 @@ static int bpf_scx_check_member(const struct btf_type *t, + return 0; + } + +-static int bpf_scx_reg(void *kdata) ++static noinline int bpf_scx_reg(void *kdata) + { + return scx_ops_enable(kdata); + } + +-static void bpf_scx_unreg(void *kdata) ++static noinline void bpf_scx_unreg(void *kdata) + { + scx_ops_disable(SCX_EXIT_UNREG); + kthread_flush_work(&scx_ops_disable_work); + } + +-static int bpf_scx_init(struct btf *btf) ++static noinline int bpf_scx_init(struct btf *btf) + { + u32 type_id; + +@@ -3592,7 +3592,7 @@ static int bpf_scx_init(struct btf *btf) + return 0; + } + +-static int bpf_scx_update(void *kdata, void *old_kdata) ++static noinline int bpf_scx_update(void *kdata, void *old_kdata) + { + /* + * sched_ext does not support updating the actively-loaded BPF +@@ -3604,7 +3604,7 @@ static int bpf_scx_update(void *kdata, void *old_kdata) + return -EOPNOTSUPP; + } + +-static int bpf_scx_validate(void *kdata) ++static noinline int bpf_scx_validate(void *kdata) + { + return 0; + } +@@ -3624,7 +3624,7 @@ struct bpf_struct_ops bpf_sched_ext_ops = { + .name = "sched_ext_ops", + }; + +-static void sysrq_handle_sched_ext_reset(int key) ++static noinline void sysrq_handle_sched_ext_reset(int key) + { + if (scx_ops_helper) + scx_ops_disable(SCX_EXIT_SYSRQ); +@@ -3639,7 +3639,7 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { + .enable_mask = SYSRQ_ENABLE_RTNICE, + }; + +-static void kick_cpus_irq_workfn(struct irq_work *irq_work) ++static noinline void kick_cpus_irq_workfn(struct irq_work *irq_work) + { + struct rq *this_rq = this_rq(); + unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); +@@ -3788,7 +3788,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { + .set = &scx_kfunc_ids_sleepable, + }; + +-static bool scx_dispatch_preamble(struct 
task_struct *p, u64 enq_flags) ++static noinline bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) + { + if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + return false; +@@ -3808,7 +3808,7 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) + return true; + } + +-static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) ++static noinline void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) + { + struct task_struct *ddsp_task; + int idx; diff --git a/scx-event-track.patch b/scx-event-track.patch new file mode 100644 index 0000000000000..15c435ba23d69 --- /dev/null +++ b/scx-event-track.patch @@ -0,0 +1,67 @@ +diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h +index 8a2d8eaefd33..d62b10de9719 100644 +--- a/include/linux/sched/ext.h ++++ b/include/linux/sched/ext.h +@@ -692,6 +692,9 @@ struct sched_ext_entity { + */ + bool disallow; /* reject switching into SCX */ + ++ u32 enq_seq; ++ u32 deq_seq; ++ + /* cold fields */ + struct list_head tasks_node; + #ifdef CONFIG_EXT_GROUP_SCHED +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 29fcdd00c184..a601a3038456 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4567,6 +4567,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + atomic64_set(&p->scx.ops_state, 0); + p->scx.runnable_at = INITIAL_JIFFIES; + p->scx.slice = SCX_SLICE_DFL; ++ p->scx.enq_seq = 0; ++ p->scx.deq_seq = 0; + #endif + + #ifdef CONFIG_PREEMPT_NOTIFIERS +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index b7a80233ea08..3ead15a505c4 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -920,6 +920,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags + { + int sticky_cpu = p->scx.sticky_cpu; + ++ p->scx.enq_seq++; ++ + enq_flags |= rq->scx.extra_enq_flags; + + if (sticky_cpu >= 0) +@@ -935,7 +937,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags + sticky_cpu = cpu_of(rq); + + if (p->scx.flags & SCX_TASK_QUEUED) { +- WARN_ON_ONCE(!watchdog_task_watched(p)); ++ if (WARN_ON_ONCE(!watchdog_task_watched(p))) ++ trace_printk("%s[%d] %u:%u WARN\n", p->comm, p->pid, p->scx.enq_seq, p->scx.deq_seq); + return; + } + +@@ -1003,6 +1006,8 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags + { + struct scx_rq *scx_rq = &rq->scx; + ++ p->scx.deq_seq++; ++ + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(watchdog_task_watched(p)); + return; +@@ -1720,6 +1725,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) + * have decided that @p should keep running. + */ + if (p->scx.flags & SCX_TASK_BAL_KEEP) { ++ trace_printk("%s[%d] %u:%u BAL_KEEP\n", p->comm, p->pid, p->scx.enq_seq, p->scx.deq_seq); + p->scx.flags &= ~SCX_TASK_BAL_KEEP; + watchdog_watch_task(rq, p); + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); From 21f4c19e4c4dd0d37552325b1c67856e779c2930 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Sep 2023 14:05:50 -1000 Subject: [PATCH 104/304] scx: Fix p->scx.flags corruption due to unsynchronized writes of SCX_TASK_ON_DSQ_PRIQ p->scx.flag is protected by the task's rq lock but one of the flags, SCX_TASK_ON_DSQ_PRIQ, is protected by p->dsq->lock, not its rq lock. This could lead to corruption of p->scx.flags through RMW races triggering watchdog and other sanity checks. Fix it moving it to its own flag field p->scx.dsq_flags which is protected by the dsq lock. 
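A minimal sketch of the race (illustrative only, not from the patch): both writers do a plain read-modify-write on the same u32 word, but under different locks, so neither update is atomic with respect to the other:

    /* CPU0, task's rq lock held */
    p->scx.flags |= SCX_TASK_QUEUED;            /* load, OR, store */

    /* CPU1, p->dsq->lock held (a different lock) */
    p->scx.flags &= ~SCX_TASK_ON_DSQ_PRIQ;      /* load, AND-NOT, store */

If CPU1 loads the old value before CPU0's store lands, CPU1's store discards SCX_TASK_QUEUED, which is exactly the kind of corruption that then trips the watchdog and the other sanity checks. Moving the dsq-protected bit into its own field, p->scx.dsq_flags, gives each lock a whole word to itself and removes the race.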
--- include/linux/sched/ext.h | 7 ++++++- kernel/sched/ext.c | 8 ++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 8a2d8eaefd33f..24f74ebeb7af1 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -598,7 +598,6 @@ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */ - SCX_TASK_ON_DSQ_PRIQ = 1 << 3, /* task is queued on the priority queue of a dsq */ SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ @@ -609,6 +608,11 @@ enum scx_ent_flags { SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ }; +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + /* * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from * everywhere and the following bits track which kfunc sets are currently @@ -646,6 +650,7 @@ struct sched_ext_entity { } dsq_node; struct list_head watchdog_node; u32 flags; /* protected by rq lock */ + u32 dsq_flags; /* protected by dsq lock */ u32 weight; s32 sticky_cpu; s32 holding_cpu; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index b7a80233ea089..bbad3a5cb42ab 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -620,7 +620,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, bool is_local = dsq->id == SCX_DSQ_LOCAL; WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); - WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) || + WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); if (!is_local) { @@ -635,7 +635,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } if (enq_flags & SCX_ENQ_DSQ_PRIQ) { - p->scx.flags |= SCX_TASK_ON_DSQ_PRIQ; + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); } else { @@ -675,10 +675,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, static void task_unlink_from_dsq(struct task_struct *p, struct scx_dispatch_q *dsq) { - if (p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) { + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); RB_CLEAR_NODE(&p->scx.dsq_node.priq); - p->scx.flags &= ~SCX_TASK_ON_DSQ_PRIQ; + p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; } else { list_del_init(&p->scx.dsq_node.fifo); } From 8424909e02885e7f4b55880aeccbb0e4458750c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Sep 2023 14:45:56 -1000 Subject: [PATCH 105/304] xxx --- scx-event-track.patch | 205 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 178 insertions(+), 27 deletions(-) diff --git a/scx-event-track.patch b/scx-event-track.patch index 15c435ba23d69..f54ded4662314 100644 --- a/scx-event-track.patch +++ b/scx-event-track.patch @@ -1,67 +1,218 @@ diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h -index 8a2d8eaefd33..d62b10de9719 100644 +index 8a2d8eaefd33..9b2c9da34ea4 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h -@@ -692,6 +692,9 @@ struct sched_ext_entity { - */ - bool disallow; /* reject switching into SCX */ +@@ -634,6 +634,31 @@ enum scx_kf_mask { + __SCX_KF_TERMINAL = 
SCX_KF_ENQUEUE | SCX_KF_REST, + }; -+ u32 enq_seq; -+ u32 deq_seq; ++enum { ++ NR_DBGEVS = 32 ++}; + - /* cold fields */ - struct list_head tasks_node; ++enum scx_dbgev_kind { ++ DBGEV_NONE, ++ DBGEV_ENQ_TASK, ++ DBGEV_DO_ENQ, ++ DBGEV_DEQ_TASK, ++ DBGEV_OPS_DEQ, ++ DBGEV_WATCH, ++ DBGEV_UNWATCH, ++ DBGEV_PRIQ_LINK, ++ DBGEV_PRIQ_UNLINK, ++}; ++ ++struct scx_dbgev { ++ u64 at; ++ u32 event; ++ u32 task_flags; ++ u64 ops_state; ++ u64 scx_flags; ++ void *bt[3]; ++}; ++ + /* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. +@@ -697,6 +722,9 @@ struct sched_ext_entity { #ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; + #endif ++ ++ struct scx_dbgev dbgevs[NR_DBGEVS]; ++ int dbgev_cursor; + }; + + void sched_ext_free(struct task_struct *p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 29fcdd00c184..a601a3038456 100644 +index 29fcdd00c184..264cc795b63e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4567,6 +4567,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4567,6 +4567,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) atomic64_set(&p->scx.ops_state, 0); p->scx.runnable_at = INITIAL_JIFFIES; p->scx.slice = SCX_SLICE_DFL; -+ p->scx.enq_seq = 0; -+ p->scx.deq_seq = 0; ++ { ++ int i; ++ for (i = 0; i < ARRAY_SIZE(p->scx.dbgevs); i++) ++ p->scx.dbgevs[i].event = DBGEV_NONE; ++ p->scx.dbgev_cursor = 0; ++ } #endif #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -index b7a80233ea08..3ead15a505c4 100644 +index b7a80233ea08..ea92b59ab41a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c -@@ -920,6 +920,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags +@@ -6,6 +6,58 @@ + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ ++ ++static void record_dbgev(struct task_struct *p, u32 dbgev, u64 now) ++{ ++ u32 cur = p->scx.dbgev_cursor; ++ struct scx_dbgev *ev = &p->scx.dbgevs[cur]; ++ ++ p->scx.dbgev_cursor = (cur + 1) % NR_DBGEVS; ++ ++ ev->at = now; ++ ev->event = dbgev; ++ ev->task_flags = p->flags; ++ ev->ops_state = p->scx.ops_state.counter; ++ ev->scx_flags = p->scx.flags; ++ ev->bt[0] = __builtin_return_address(0); ++ ev->bt[1] = __builtin_return_address(1); ++ ev->bt[2] = __builtin_return_address(2); ++} ++ ++static const char *dbgev_name(u32 event) ++{ ++ static const char *names[] = { ++ [DBGEV_NONE] = "NONE", ++ [DBGEV_ENQ_TASK] = "ENQ_TASK", ++ [DBGEV_DO_ENQ] = "DO_ENQ", ++ [DBGEV_DEQ_TASK] = "DEQ_TASK", ++ [DBGEV_OPS_DEQ] = "OPS_DEQ", ++ [DBGEV_WATCH] = "WATCH", ++ [DBGEV_UNWATCH] = "UNWATCH", ++ [DBGEV_PRIQ_LINK] = "PRIQ_LINK", ++ [DBGEV_PRIQ_UNLINK] = "PRIQ_UNLINK", ++ }; ++ ++ if (event >= ARRAY_SIZE(names) || !names[event]) ++ return "UNKNOWN"; ++ return names[event]; ++} ++ ++static void dump_dbgevs(struct task_struct *p) ++{ ++ int i; ++ ++ for (i = 0; i < NR_DBGEVS; i++) { ++ u32 cur = (p->scx.dbgev_cursor + i) % NR_DBGEVS; ++ struct scx_dbgev *ev = &p->scx.dbgevs[cur]; ++ ++ trace_printk("DBGEV %llu %-12s t=0x%08x o=0x%08llx s=0x%08llx %pS:%pS:%pS\n", ++ ev->at / 1000, dbgev_name(ev->event), ++ ev->task_flags, ev->ops_state, ev->scx_flags, ++ ev->bt[0], ev->bt[1], ev->bt[2]); ++ } ++} ++ + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + + enum scx_internal_consts { +@@ -620,8 +672,9 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, 
+ bool is_local = dsq->id == SCX_DSQ_LOCAL; + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); +- WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) || +- !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); ++ if (WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) || ++ !RB_EMPTY_NODE(&p->scx.dsq_node.priq))) ++ dump_dbgevs(p); + + if (!is_local) { + raw_spin_lock(&dsq->lock); +@@ -636,6 +689,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + p->scx.flags |= SCX_TASK_ON_DSQ_PRIQ; ++ record_dbgev(p, DBGEV_PRIQ_LINK, rq_clock_task(task_rq(p))); + rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, + scx_dsq_priq_less); + } else { +@@ -678,6 +732,7 @@ static void task_unlink_from_dsq(struct task_struct *p, + if (p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) { + rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); ++ record_dbgev(p, DBGEV_PRIQ_UNLINK, rq_clock_task(task_rq(p))); + p->scx.flags &= ~SCX_TASK_ON_DSQ_PRIQ; + } else { + list_del_init(&p->scx.dsq_node.fifo); +@@ -820,6 +875,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + struct task_struct **ddsp_taskp; + unsigned long qseq; + ++ record_dbgev(p, DBGEV_DO_ENQ, rq_clock_task(rq)); ++ + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + + if (p->scx.flags & SCX_TASK_ENQ_LOCAL) { +@@ -902,6 +959,8 @@ static bool watchdog_task_watched(const struct task_struct *p) + + static void watchdog_watch_task(struct rq *rq, struct task_struct *p) + { ++ record_dbgev(p, DBGEV_WATCH, rq_clock_task(rq)); ++ + lockdep_assert_rq_held(rq); + if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) + p->scx.runnable_at = jiffies; +@@ -911,6 +970,8 @@ static void watchdog_watch_task(struct rq *rq, struct task_struct *p) + + static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) + { ++ record_dbgev(p, DBGEV_UNWATCH, rq_clock_task(task_rq(p))); ++ + list_del_init(&p->scx.watchdog_node); + if (reset_timeout) + p->scx.flags |= SCX_TASK_WATCHDOG_RESET; +@@ -920,6 +981,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags { int sticky_cpu = p->scx.sticky_cpu; -+ p->scx.enq_seq++; ++ record_dbgev(p, DBGEV_ENQ_TASK, rq_clock_task(rq)); + enq_flags |= rq->scx.extra_enq_flags; if (sticky_cpu >= 0) -@@ -935,7 +937,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags +@@ -935,7 +998,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags sticky_cpu = cpu_of(rq); if (p->scx.flags & SCX_TASK_QUEUED) { - WARN_ON_ONCE(!watchdog_task_watched(p)); + if (WARN_ON_ONCE(!watchdog_task_watched(p))) -+ trace_printk("%s[%d] %u:%u WARN\n", p->comm, p->pid, p->scx.enq_seq, p->scx.deq_seq); ++ dump_dbgevs(p); return; } -@@ -1003,6 +1006,8 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags +@@ -957,6 +1021,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) + { + unsigned long opss; + ++ record_dbgev(p, DBGEV_OPS_DEQ, rq_clock_task(task_rq(p))); ++ + watchdog_unwatch_task(p, false); + + /* acquire ensures that we see the preceding updates on QUEUED */ +@@ -1003,6 +1069,8 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags { struct scx_rq *scx_rq = &rq->scx; -+ p->scx.deq_seq++; ++ record_dbgev(p, DBGEV_DEQ_TASK, rq_clock_task(rq)); + if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(watchdog_task_watched(p)); return; -@@ -1720,6 +1725,7 @@ static void 
put_prev_task_scx(struct rq *rq, struct task_struct *p) - * have decided that @p should keep running. - */ - if (p->scx.flags & SCX_TASK_BAL_KEEP) { -+ trace_printk("%s[%d] %u:%u BAL_KEEP\n", p->comm, p->pid, p->scx.enq_seq, p->scx.deq_seq); - p->scx.flags &= ~SCX_TASK_BAL_KEEP; - watchdog_watch_task(rq, p); - dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); From be81498d88fbcc1dd0b71f87254f846c453e8785 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Sep 2023 11:46:44 -1000 Subject: [PATCH 106/304] scx_rusty: Keep .bpf.o files for debugging --- tools/sched_ext/scx_rusty/build.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index d47a754514ada..2385e7e6f040f 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -46,8 +46,10 @@ fn gen_bpf_sched(name: &str) { let outpath = format!("./src/bpf/.output/{}.skel.rs", name); let skel = Path::new(&outpath); let src = format!("./src/bpf/{}.bpf.c", name); + let obj = format!("./src/bpf/.output/{}.bpf.o", name); SkeletonBuilder::new() .source(src.clone()) + .obj(obj) .clang(clang) .clang_args(bpf_cflags) .build_and_generate(skel) From 997c4506ef5fdbe861eba4147ad335201e4660d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Sep 2023 10:16:52 -1000 Subject: [PATCH 107/304] Revert "Merge pull request #48 from sched-ext/rusty-keep-bpf-o" This reverts commit 664d65059cf5dbecc5049cab619c57be08f94e52, reversing changes made to ee9077a8f2560e17f737557778eb487d9a688c06. --- noinline-everything.patch | 720 ----------------------------- scx-event-track.patch | 218 --------- tools/sched_ext/scx_rusty/build.rs | 2 - 3 files changed, 940 deletions(-) delete mode 100644 noinline-everything.patch delete mode 100644 scx-event-track.patch diff --git a/noinline-everything.patch b/noinline-everything.patch deleted file mode 100644 index 882349fd77071..0000000000000 --- a/noinline-everything.patch +++ /dev/null @@ -1,720 +0,0 @@ -diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -index b7a80233ea08..b9f6e50aaafe 100644 ---- a/kernel/sched/ext.c -+++ b/kernel/sched/ext.c -@@ -814,8 +814,8 @@ static bool test_rq_online(struct rq *rq) - #endif - } - --static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, -- int sticky_cpu) -+static noinline void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, -+ int sticky_cpu) - { - struct task_struct **ddsp_taskp; - unsigned long qseq; -@@ -895,12 +895,12 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, - dispatch_enqueue(&scx_dsq_global, p, enq_flags); - } - --static bool watchdog_task_watched(const struct task_struct *p) -+static noinline bool watchdog_task_watched(const struct task_struct *p) - { - return !list_empty(&p->scx.watchdog_node); - } - --static void watchdog_watch_task(struct rq *rq, struct task_struct *p) -+static noinline void watchdog_watch_task(struct rq *rq, struct task_struct *p) - { - lockdep_assert_rq_held(rq); - if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) -@@ -909,14 +909,14 @@ static void watchdog_watch_task(struct rq *rq, struct task_struct *p) - list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list); - } - --static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) -+static noinline void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) - { - list_del_init(&p->scx.watchdog_node); - if (reset_timeout) - p->scx.flags |= SCX_TASK_WATCHDOG_RESET; - } - --static 
void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) -+static noinline void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) - { - int sticky_cpu = p->scx.sticky_cpu; - -@@ -953,7 +953,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags - do_enqueue_task(rq, p, enq_flags, sticky_cpu); - } - --static void ops_dequeue(struct task_struct *p, u64 deq_flags) -+static noinline void ops_dequeue(struct task_struct *p, u64 deq_flags) - { - unsigned long opss; - -@@ -999,7 +999,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) - } - } - --static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) -+static noinline void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) - { - struct scx_rq *scx_rq = &rq->scx; - -@@ -1042,7 +1042,7 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags - dispatch_dequeue(scx_rq, p); - } - --static void yield_task_scx(struct rq *rq) -+static noinline void yield_task_scx(struct rq *rq) - { - struct task_struct *p = rq->curr; - -@@ -1052,7 +1052,7 @@ static void yield_task_scx(struct rq *rq) - p->scx.slice = 0; - } - --static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) -+static noinline bool yield_to_task_scx(struct rq *rq, struct task_struct *to) - { - struct task_struct *from = rq->curr; - -@@ -1087,8 +1087,8 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) - * Returns %true if @p was successfully moved. %false after racing dequeue and - * losing. - */ --static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, -- u64 enq_flags) -+static noinline bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, -+ u64 enq_flags) - { - struct rq *task_rq; - -@@ -1142,8 +1142,8 @@ static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, - * @rq stays locked isn't important as long as the state is restored after - * dispatch_to_local_dsq_unlock(). - */ --static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, -- struct rq *src_rq, struct rq *dst_rq) -+static noinline void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) - { - rq_unpin_lock(rq, rf); - -@@ -1171,8 +1171,8 @@ static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, - * - * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. 
- */ --static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, -- struct rq *src_rq, struct rq *dst_rq) -+static noinline void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) - { - if (src_rq == dst_rq) { - raw_spin_rq_unlock(dst_rq); -@@ -1191,14 +1191,14 @@ static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, - #endif /* CONFIG_SMP */ - - --static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) -+static noinline bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) - { - return likely(test_rq_online(rq)) && !is_migration_disabled(p) && - cpumask_test_cpu(cpu_of(rq), p->cpus_ptr); - } - --static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, -- struct scx_dispatch_q *dsq) -+static noinline bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq) - { - struct scx_rq *scx_rq = &rq->scx; - struct task_struct *p; -@@ -1293,7 +1293,7 @@ enum dispatch_to_local_dsq_ret { - * The caller must have exclusive ownership of @p (e.g. through - * %SCX_OPSS_DISPATCHING). - */ --static enum dispatch_to_local_dsq_ret -+static noinline enum dispatch_to_local_dsq_ret - dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, - struct task_struct *p, u64 enq_flags) - { -@@ -1412,7 +1412,7 @@ dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, - * was valid in the first place. Make sure that the task is still owned by the - * BPF scheduler and claim the ownership before dispatching. - */ --static void finish_dispatch(struct rq *rq, struct rq_flags *rf, -+static noinline void finish_dispatch(struct rq *rq, struct rq_flags *rf, - struct task_struct *p, - unsigned long qseq_at_dispatch, - u64 dsq_id, u64 enq_flags) -@@ -1482,7 +1482,7 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, - } - } - --static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) -+static noinline void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) - { - struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); - u32 u; -@@ -1498,7 +1498,7 @@ static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) - dspc->buf_cursor = 0; - } - --static int balance_one(struct rq *rq, struct task_struct *prev, -+static noinline int balance_one(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf, bool local) - { - struct scx_rq *scx_rq = &rq->scx; -@@ -1600,7 +1600,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, - return 0; - } - --static int balance_scx(struct rq *rq, struct task_struct *prev, -+static noinline int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) - { - int ret; -@@ -1642,7 +1642,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, - return ret; - } - --static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) -+static noinline void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) - { - if (p->scx.flags & SCX_TASK_QUEUED) { - /* -@@ -1676,7 +1676,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) - } - } - --static void put_prev_task_scx(struct rq *rq, struct task_struct *p) -+static noinline void put_prev_task_scx(struct rq *rq, struct task_struct *p) - { - #ifndef CONFIG_SMP - /* -@@ -1756,7 +1756,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) - } - } - --static struct task_struct *first_local_task(struct rq *rq) -+static 
noinline struct task_struct *first_local_task(struct rq *rq) - { - struct rb_node *rb_node; - -@@ -1772,7 +1772,7 @@ static struct task_struct *first_local_task(struct rq *rq) - return NULL; - } - --static struct task_struct *pick_next_task_scx(struct rq *rq) -+static noinline struct task_struct *pick_next_task_scx(struct rq *rq) - { - struct task_struct *p; - -@@ -1846,7 +1846,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, - * at the first task in the local dsq. @rq->curr has to be considered explicitly - * to mimic %SCX_TASK_BAL_KEEP. - */ --static struct task_struct *pick_task_scx(struct rq *rq) -+static noinline struct task_struct *pick_task_scx(struct rq *rq) - { - struct task_struct *curr = rq->curr; - struct task_struct *first = first_local_task(rq); -@@ -1878,7 +1878,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) - } - #endif /* CONFIG_SCHED_CORE */ - --static enum scx_cpu_preempt_reason -+static noinline enum scx_cpu_preempt_reason - preempt_reason_from_class(const struct sched_class *class) - { - #ifdef CONFIG_SMP -@@ -1932,7 +1932,7 @@ void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, - - #ifdef CONFIG_SMP - --static bool test_and_clear_cpu_idle(int cpu) -+static noinline bool test_and_clear_cpu_idle(int cpu) - { - #ifdef CONFIG_SCHED_SMT - /* -@@ -1958,7 +1958,7 @@ static bool test_and_clear_cpu_idle(int cpu) - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); - } - --static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+static noinline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) - { - int cpu; - -@@ -1983,7 +1983,7 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) - goto retry; - } - --static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+static noinline s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) - { - s32 cpu; - -@@ -2040,7 +2040,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag - return prev_cpu; - } - --static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) -+static noinline int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) - { - if (SCX_HAS_OP(select_cpu)) { - s32 cpu; -@@ -2058,7 +2058,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag - } - } - --static void set_cpus_allowed_scx(struct task_struct *p, -+static noinline void set_cpus_allowed_scx(struct task_struct *p, - struct affinity_context *ac) - { - set_cpus_allowed_common(p, ac); -@@ -2076,7 +2076,7 @@ static void set_cpus_allowed_scx(struct task_struct *p, - (struct cpumask *)p->cpus_ptr); - } - --static void reset_idle_masks(void) -+static noinline void reset_idle_masks(void) - { - /* consider all cpus idle, should converge to the actual state quickly */ - cpumask_setall(idle_masks.cpu); -@@ -2119,13 +2119,13 @@ void __scx_update_idle(struct rq *rq, bool idle) - #endif - } - --static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) -+static noinline void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) - { - if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) - SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); - } - --static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) -+static noinline void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) - { - if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) - 
SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); -@@ -2133,13 +2133,13 @@ static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) - - #else /* !CONFIG_SMP */ - --static bool test_and_clear_cpu_idle(int cpu) { return false; } --static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } --static void reset_idle_masks(void) {} -+static noinline bool test_and_clear_cpu_idle(int cpu) { return false; } -+static noinline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -+static noinline void reset_idle_masks(void) {} - - #endif /* CONFIG_SMP */ - --static bool check_rq_for_timeouts(struct rq *rq) -+static noinline bool check_rq_for_timeouts(struct rq *rq) - { - struct task_struct *p; - struct rq_flags rf; -@@ -2166,7 +2166,7 @@ static bool check_rq_for_timeouts(struct rq *rq) - return timed_out; - } - --static void scx_watchdog_workfn(struct work_struct *work) -+static noinline void scx_watchdog_workfn(struct work_struct *work) - { - int cpu; - -@@ -2182,7 +2182,7 @@ static void scx_watchdog_workfn(struct work_struct *work) - scx_watchdog_timeout / 2); - } - --static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) -+static noinline void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) - { - update_curr_scx(rq); - -@@ -2200,7 +2200,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) - } - - #ifdef CONFIG_EXT_GROUP_SCHED --static struct cgroup *tg_cgrp(struct task_group *tg) -+static noinline struct cgroup *tg_cgrp(struct task_group *tg) - { - /* - * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, -@@ -2221,7 +2221,7 @@ static struct cgroup *tg_cgrp(struct task_group *tg) - - #endif /* CONFIG_EXT_GROUP_SCHED */ - --static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) -+static noinline int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) - { - int ret; - -@@ -2266,7 +2266,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) - return 0; - } - --static void scx_ops_enable_task(struct task_struct *p) -+static noinline void scx_ops_enable_task(struct task_struct *p) - { - lockdep_assert_rq_held(task_rq(p)); - WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); -@@ -2281,7 +2281,7 @@ static void scx_ops_enable_task(struct task_struct *p) - p->scx.flags |= SCX_TASK_OPS_ENABLED; - } - --static void scx_ops_disable_task(struct task_struct *p) -+static noinline void scx_ops_disable_task(struct task_struct *p) - { - lockdep_assert_rq_held(task_rq(p)); - -@@ -2300,7 +2300,7 @@ static void scx_ops_disable_task(struct task_struct *p) - } - } - --static void set_task_scx_weight(struct task_struct *p) -+static noinline void set_task_scx_weight(struct task_struct *p) - { - u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; - -@@ -2317,7 +2317,7 @@ static void set_task_scx_weight(struct task_struct *p) - * created, priority is changed for a task on sched_ext, and a task is switched - * to sched_ext from other classes. 
- */ --static void refresh_scx_weight(struct task_struct *p) -+static noinline void refresh_scx_weight(struct task_struct *p) - { - lockdep_assert_rq_held(task_rq(p)); - set_task_scx_weight(p); -@@ -2402,16 +2402,16 @@ void sched_ext_free(struct task_struct *p) - } - } - --static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) -+static noinline void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) - { - refresh_scx_weight(p); - } - --static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) -+static noinline void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) - { - } - --static void switching_to_scx(struct rq *rq, struct task_struct *p) -+static noinline void switching_to_scx(struct rq *rq, struct task_struct *p) - { - refresh_scx_weight(p); - -@@ -2424,8 +2424,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) - (struct cpumask *)p->cpus_ptr); - } - --static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} --static void switched_to_scx(struct rq *rq, struct task_struct *p) {} -+static noinline void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} -+static noinline void switched_to_scx(struct rq *rq, struct task_struct *p) {} - - int scx_check_setscheduler(struct task_struct *p, int policy) - { -@@ -2602,12 +2602,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) - percpu_up_read(&scx_cgroup_rwsem); - } - --static void scx_cgroup_lock(void) -+static noinline void scx_cgroup_lock(void) - { - percpu_down_write(&scx_cgroup_rwsem); - } - --static void scx_cgroup_unlock(void) -+static noinline void scx_cgroup_unlock(void) - { - percpu_up_write(&scx_cgroup_rwsem); - } -@@ -2674,7 +2674,7 @@ DEFINE_SCHED_CLASS(ext) = { - #endif - }; - --static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) -+static noinline void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) - { - memset(dsq, 0, sizeof(*dsq)); - -@@ -2683,7 +2683,7 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) - dsq->id = dsq_id; - } - --static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -+static noinline struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) - { - struct scx_dispatch_q *dsq; - int ret; -@@ -2706,7 +2706,7 @@ static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) - return dsq; - } - --static void free_dsq_irq_workfn(struct irq_work *irq_work) -+static noinline void free_dsq_irq_workfn(struct irq_work *irq_work) - { - struct llist_node *to_free = llist_del_all(&dsqs_to_free); - struct scx_dispatch_q *dsq, *tmp_dsq; -@@ -2717,7 +2717,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) - - static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); - --static void destroy_dsq(u64 dsq_id) -+static noinline void destroy_dsq(u64 dsq_id) - { - struct scx_dispatch_q *dsq; - unsigned long flags; -@@ -2756,7 +2756,7 @@ static void destroy_dsq(u64 dsq_id) - } - - #ifdef CONFIG_EXT_GROUP_SCHED --static void scx_cgroup_exit(void) -+static noinline void scx_cgroup_exit(void) - { - struct cgroup_subsys_state *css; - -@@ -2789,7 +2789,7 @@ static void scx_cgroup_exit(void) - rcu_read_unlock(); - } - --static int scx_cgroup_init(void) -+static noinline int scx_cgroup_init(void) - { - struct cgroup_subsys_state *css; - int ret; -@@ -2834,7 +2834,7 @@ static int scx_cgroup_init(void) - return 0; - } - --static void scx_cgroup_config_knobs(void) -+static noinline void 
scx_cgroup_config_knobs(void) - { - static DEFINE_MUTEX(cgintf_mutex); - DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; -@@ -2875,9 +2875,9 @@ static void scx_cgroup_config_knobs(void) - } - - #else --static void scx_cgroup_exit(void) {} --static int scx_cgroup_init(void) { return 0; } --static void scx_cgroup_config_knobs(void) {} -+static noinline void scx_cgroup_exit(void) {} -+static noinline int scx_cgroup_init(void) { return 0; } -+static noinline void scx_cgroup_config_knobs(void) {} - #endif - - /* -@@ -2893,7 +2893,7 @@ bool task_should_scx(struct task_struct *p) - return p->policy == SCHED_EXT; - } - --static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) -+static noinline void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) - { - if (enq_flags & SCX_ENQ_LAST) - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -@@ -2901,9 +2901,9 @@ static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); - } - --static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} -+static noinline void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} - --static void scx_ops_disable_workfn(struct kthread_work *work) -+static noinline void scx_ops_disable_workfn(struct kthread_work *work) - { - struct scx_exit_info *ei = &scx_exit_info; - struct scx_task_iter sti; -@@ -3113,7 +3113,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) - - static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); - --static void schedule_scx_ops_disable_work(void) -+static noinline void schedule_scx_ops_disable_work(void) - { - struct kthread_worker *helper = READ_ONCE(scx_ops_helper); - -@@ -3125,7 +3125,7 @@ static void schedule_scx_ops_disable_work(void) - kthread_queue_work(helper, &scx_ops_disable_work); - } - --static void scx_ops_disable(enum scx_exit_type type) -+static noinline void scx_ops_disable(enum scx_exit_type type) - { - int none = SCX_EXIT_NONE; - -@@ -3137,7 +3137,7 @@ static void scx_ops_disable(enum scx_exit_type type) - schedule_scx_ops_disable_work(); - } - --static void scx_ops_error_irq_workfn(struct irq_work *irq_work) -+static noinline void scx_ops_error_irq_workfn(struct irq_work *irq_work) - { - schedule_scx_ops_disable_work(); - } -@@ -3163,7 +3163,7 @@ __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, - irq_work_queue(&scx_ops_error_irq_work); - } - --static struct kthread_worker *scx_create_rt_helper(const char *name) -+static noinline struct kthread_worker *scx_create_rt_helper(const char *name) - { - struct kthread_worker *helper; - -@@ -3173,7 +3173,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) - return helper; - } - --static int scx_ops_enable(struct sched_ext_ops *ops) -+static noinline int scx_ops_enable(struct sched_ext_ops *ops) - { - struct scx_task_iter sti; - struct task_struct *p; -@@ -3412,7 +3412,7 @@ static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_DISABLED] = "disabled", - }; - --static int scx_debug_show(struct seq_file *m, void *v) -+static noinline int scx_debug_show(struct seq_file *m, void *v) - { - mutex_lock(&scx_ops_enable_mutex); - seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); -@@ -3428,7 +3428,7 @@ static int scx_debug_show(struct seq_file *m, void *v) - return 0; - } - --static int scx_debug_open(struct inode *inode, struct file *file) -+static noinline int scx_debug_open(struct inode *inode, struct file *file) - { - 
return single_open(file, scx_debug_show, NULL); - } -@@ -3451,7 +3451,7 @@ const struct file_operations sched_ext_fops = { - extern struct btf *btf_vmlinux; - static const struct btf_type *task_struct_type; - --static bool bpf_scx_is_valid_access(int off, int size, -+static noinline bool bpf_scx_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -@@ -3466,7 +3466,7 @@ static bool bpf_scx_is_valid_access(int off, int size, - return btf_ctx_access(off, size, type, prog, info); - } - --static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, -+static noinline int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, - const struct bpf_reg_state *reg, int off, - int size) - { -@@ -3488,7 +3488,7 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, - return -EACCES; - } - --static const struct bpf_func_proto * -+static noinline const struct bpf_func_proto * - bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) - { - switch (func_id) { -@@ -3507,7 +3507,7 @@ const struct bpf_verifier_ops bpf_scx_verifier_ops = { - .btf_struct_access = bpf_scx_btf_struct_access, - }; - --static int bpf_scx_init_member(const struct btf_type *t, -+static noinline int bpf_scx_init_member(const struct btf_type *t, - const struct btf_member *member, - void *kdata, const void *udata) - { -@@ -3545,7 +3545,7 @@ static int bpf_scx_init_member(const struct btf_type *t, - return 0; - } - --static int bpf_scx_check_member(const struct btf_type *t, -+static noinline int bpf_scx_check_member(const struct btf_type *t, - const struct btf_member *member, - const struct bpf_prog *prog) - { -@@ -3569,18 +3569,18 @@ static int bpf_scx_check_member(const struct btf_type *t, - return 0; - } - --static int bpf_scx_reg(void *kdata) -+static noinline int bpf_scx_reg(void *kdata) - { - return scx_ops_enable(kdata); - } - --static void bpf_scx_unreg(void *kdata) -+static noinline void bpf_scx_unreg(void *kdata) - { - scx_ops_disable(SCX_EXIT_UNREG); - kthread_flush_work(&scx_ops_disable_work); - } - --static int bpf_scx_init(struct btf *btf) -+static noinline int bpf_scx_init(struct btf *btf) - { - u32 type_id; - -@@ -3592,7 +3592,7 @@ static int bpf_scx_init(struct btf *btf) - return 0; - } - --static int bpf_scx_update(void *kdata, void *old_kdata) -+static noinline int bpf_scx_update(void *kdata, void *old_kdata) - { - /* - * sched_ext does not support updating the actively-loaded BPF -@@ -3604,7 +3604,7 @@ static int bpf_scx_update(void *kdata, void *old_kdata) - return -EOPNOTSUPP; - } - --static int bpf_scx_validate(void *kdata) -+static noinline int bpf_scx_validate(void *kdata) - { - return 0; - } -@@ -3624,7 +3624,7 @@ struct bpf_struct_ops bpf_sched_ext_ops = { - .name = "sched_ext_ops", - }; - --static void sysrq_handle_sched_ext_reset(int key) -+static noinline void sysrq_handle_sched_ext_reset(int key) - { - if (scx_ops_helper) - scx_ops_disable(SCX_EXIT_SYSRQ); -@@ -3639,7 +3639,7 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { - .enable_mask = SYSRQ_ENABLE_RTNICE, - }; - --static void kick_cpus_irq_workfn(struct irq_work *irq_work) -+static noinline void kick_cpus_irq_workfn(struct irq_work *irq_work) - { - struct rq *this_rq = this_rq(); - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); -@@ -3788,7 +3788,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { - .set = &scx_kfunc_ids_sleepable, - }; - --static bool scx_dispatch_preamble(struct 
task_struct *p, u64 enq_flags) -+static noinline bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) - { - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) - return false; -@@ -3808,7 +3808,7 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) - return true; - } - --static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) -+static noinline void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) - { - struct task_struct *ddsp_task; - int idx; diff --git a/scx-event-track.patch b/scx-event-track.patch deleted file mode 100644 index f54ded4662314..0000000000000 --- a/scx-event-track.patch +++ /dev/null @@ -1,218 +0,0 @@ -diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h -index 8a2d8eaefd33..9b2c9da34ea4 100644 ---- a/include/linux/sched/ext.h -+++ b/include/linux/sched/ext.h -@@ -634,6 +634,31 @@ enum scx_kf_mask { - __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_REST, - }; - -+enum { -+ NR_DBGEVS = 32 -+}; -+ -+enum scx_dbgev_kind { -+ DBGEV_NONE, -+ DBGEV_ENQ_TASK, -+ DBGEV_DO_ENQ, -+ DBGEV_DEQ_TASK, -+ DBGEV_OPS_DEQ, -+ DBGEV_WATCH, -+ DBGEV_UNWATCH, -+ DBGEV_PRIQ_LINK, -+ DBGEV_PRIQ_UNLINK, -+}; -+ -+struct scx_dbgev { -+ u64 at; -+ u32 event; -+ u32 task_flags; -+ u64 ops_state; -+ u64 scx_flags; -+ void *bt[3]; -+}; -+ - /* - * The following is embedded in task_struct and contains all fields necessary - * for a task to be scheduled by SCX. -@@ -697,6 +722,9 @@ struct sched_ext_entity { - #ifdef CONFIG_EXT_GROUP_SCHED - struct cgroup *cgrp_moving_from; - #endif -+ -+ struct scx_dbgev dbgevs[NR_DBGEVS]; -+ int dbgev_cursor; - }; - - void sched_ext_free(struct task_struct *p); -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 29fcdd00c184..264cc795b63e 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4567,6 +4567,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - atomic64_set(&p->scx.ops_state, 0); - p->scx.runnable_at = INITIAL_JIFFIES; - p->scx.slice = SCX_SLICE_DFL; -+ { -+ int i; -+ for (i = 0; i < ARRAY_SIZE(p->scx.dbgevs); i++) -+ p->scx.dbgevs[i].event = DBGEV_NONE; -+ p->scx.dbgev_cursor = 0; -+ } - #endif - - #ifdef CONFIG_PREEMPT_NOTIFIERS -diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -index b7a80233ea08..ea92b59ab41a 100644 ---- a/kernel/sched/ext.c -+++ b/kernel/sched/ext.c -@@ -6,6 +6,58 @@ - * Copyright (c) 2022 Tejun Heo - * Copyright (c) 2022 David Vernet - */ -+ -+static void record_dbgev(struct task_struct *p, u32 dbgev, u64 now) -+{ -+ u32 cur = p->scx.dbgev_cursor; -+ struct scx_dbgev *ev = &p->scx.dbgevs[cur]; -+ -+ p->scx.dbgev_cursor = (cur + 1) % NR_DBGEVS; -+ -+ ev->at = now; -+ ev->event = dbgev; -+ ev->task_flags = p->flags; -+ ev->ops_state = p->scx.ops_state.counter; -+ ev->scx_flags = p->scx.flags; -+ ev->bt[0] = __builtin_return_address(0); -+ ev->bt[1] = __builtin_return_address(1); -+ ev->bt[2] = __builtin_return_address(2); -+} -+ -+static const char *dbgev_name(u32 event) -+{ -+ static const char *names[] = { -+ [DBGEV_NONE] = "NONE", -+ [DBGEV_ENQ_TASK] = "ENQ_TASK", -+ [DBGEV_DO_ENQ] = "DO_ENQ", -+ [DBGEV_DEQ_TASK] = "DEQ_TASK", -+ [DBGEV_OPS_DEQ] = "OPS_DEQ", -+ [DBGEV_WATCH] = "WATCH", -+ [DBGEV_UNWATCH] = "UNWATCH", -+ [DBGEV_PRIQ_LINK] = "PRIQ_LINK", -+ [DBGEV_PRIQ_UNLINK] = "PRIQ_UNLINK", -+ }; -+ -+ if (event >= ARRAY_SIZE(names) || !names[event]) -+ return "UNKNOWN"; -+ return names[event]; -+} -+ -+static void dump_dbgevs(struct task_struct *p) -+{ -+ int i; -+ -+ for (i = 0; i 
< NR_DBGEVS; i++) { -+ u32 cur = (p->scx.dbgev_cursor + i) % NR_DBGEVS; -+ struct scx_dbgev *ev = &p->scx.dbgevs[cur]; -+ -+ trace_printk("DBGEV %llu %-12s t=0x%08x o=0x%08llx s=0x%08llx %pS:%pS:%pS\n", -+ ev->at / 1000, dbgev_name(ev->event), -+ ev->task_flags, ev->ops_state, ev->scx_flags, -+ ev->bt[0], ev->bt[1], ev->bt[2]); -+ } -+} -+ - #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) - - enum scx_internal_consts { -@@ -620,8 +672,9 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, - bool is_local = dsq->id == SCX_DSQ_LOCAL; - - WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); -- WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) || -- !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); -+ if (WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) || -+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq))) -+ dump_dbgevs(p); - - if (!is_local) { - raw_spin_lock(&dsq->lock); -@@ -636,6 +689,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, - - if (enq_flags & SCX_ENQ_DSQ_PRIQ) { - p->scx.flags |= SCX_TASK_ON_DSQ_PRIQ; -+ record_dbgev(p, DBGEV_PRIQ_LINK, rq_clock_task(task_rq(p))); - rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, - scx_dsq_priq_less); - } else { -@@ -678,6 +732,7 @@ static void task_unlink_from_dsq(struct task_struct *p, - if (p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) { - rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); - RB_CLEAR_NODE(&p->scx.dsq_node.priq); -+ record_dbgev(p, DBGEV_PRIQ_UNLINK, rq_clock_task(task_rq(p))); - p->scx.flags &= ~SCX_TASK_ON_DSQ_PRIQ; - } else { - list_del_init(&p->scx.dsq_node.fifo); -@@ -820,6 +875,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, - struct task_struct **ddsp_taskp; - unsigned long qseq; - -+ record_dbgev(p, DBGEV_DO_ENQ, rq_clock_task(rq)); -+ - WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); - - if (p->scx.flags & SCX_TASK_ENQ_LOCAL) { -@@ -902,6 +959,8 @@ static bool watchdog_task_watched(const struct task_struct *p) - - static void watchdog_watch_task(struct rq *rq, struct task_struct *p) - { -+ record_dbgev(p, DBGEV_WATCH, rq_clock_task(rq)); -+ - lockdep_assert_rq_held(rq); - if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) - p->scx.runnable_at = jiffies; -@@ -911,6 +970,8 @@ static void watchdog_watch_task(struct rq *rq, struct task_struct *p) - - static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) - { -+ record_dbgev(p, DBGEV_UNWATCH, rq_clock_task(task_rq(p))); -+ - list_del_init(&p->scx.watchdog_node); - if (reset_timeout) - p->scx.flags |= SCX_TASK_WATCHDOG_RESET; -@@ -920,6 +981,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags - { - int sticky_cpu = p->scx.sticky_cpu; - -+ record_dbgev(p, DBGEV_ENQ_TASK, rq_clock_task(rq)); -+ - enq_flags |= rq->scx.extra_enq_flags; - - if (sticky_cpu >= 0) -@@ -935,7 +998,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags - sticky_cpu = cpu_of(rq); - - if (p->scx.flags & SCX_TASK_QUEUED) { -- WARN_ON_ONCE(!watchdog_task_watched(p)); -+ if (WARN_ON_ONCE(!watchdog_task_watched(p))) -+ dump_dbgevs(p); - return; - } - -@@ -957,6 +1021,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) - { - unsigned long opss; - -+ record_dbgev(p, DBGEV_OPS_DEQ, rq_clock_task(task_rq(p))); -+ - watchdog_unwatch_task(p, false); - - /* acquire ensures that we see the preceding updates on QUEUED */ -@@ -1003,6 +1069,8 @@ static void dequeue_task_scx(struct rq *rq, struct 
task_struct *p, int deq_flags - { - struct scx_rq *scx_rq = &rq->scx; - -+ record_dbgev(p, DBGEV_DEQ_TASK, rq_clock_task(rq)); -+ - if (!(p->scx.flags & SCX_TASK_QUEUED)) { - WARN_ON_ONCE(watchdog_task_watched(p)); - return; diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index 2385e7e6f040f..d47a754514ada 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -46,10 +46,8 @@ fn gen_bpf_sched(name: &str) { let outpath = format!("./src/bpf/.output/{}.skel.rs", name); let skel = Path::new(&outpath); let src = format!("./src/bpf/{}.bpf.c", name); - let obj = format!("./src/bpf/.output/{}.bpf.o", name); SkeletonBuilder::new() .source(src.clone()) - .obj(obj) .clang(clang) .clang_args(bpf_cflags) .build_and_generate(skel) From 258510ec62a726d464bb2b01c4774b44e70eda6f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Sep 2023 11:46:44 -1000 Subject: [PATCH 108/304] scx_rusty: Keep .bpf.o files for debugging (cherry picked from commit be81498d88fbcc1dd0b71f87254f846c453e8785) --- tools/sched_ext/scx_rusty/build.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index d47a754514ada..2385e7e6f040f 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -46,8 +46,10 @@ fn gen_bpf_sched(name: &str) { let outpath = format!("./src/bpf/.output/{}.skel.rs", name); let skel = Path::new(&outpath); let src = format!("./src/bpf/{}.bpf.c", name); + let obj = format!("./src/bpf/.output/{}.bpf.o", name); SkeletonBuilder::new() .source(src.clone()) + .obj(obj) .clang(clang) .clang_args(bpf_cflags) .build_and_generate(skel) From c70e7d396776d8e8bc8b1bc24c991c23c64862dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 26 Sep 2023 09:54:14 -1000 Subject: [PATCH 109/304] rusty: Don't use bpf_cpumask_full() to set task_ctx->all_cpus Instead, collect all per-dom cpumasks into all_cpumask and test whether that's a subset of a task's cpumask. bpf_cpumask_full() can incorrectly indicate that a task's affinity is restricted when it's not depending on the machine configuration. 
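The failure mode is easiest to see when the possible-CPU ID space is larger than the set of CPUs the scheduler actually places into domains (for instance, on systems with offline or not-present CPU IDs): a task allowed on every domain CPU is effectively unrestricted, yet its cpumask is not "full" with respect to the possible mask. Below is a small userspace sketch of the two tests, using plain C bitmask helpers that only mimic the semantics of bpf_cpumask_full()/bpf_cpumask_subset(); the CPU counts are invented for illustration.

/* Userspace sketch of the "full" vs. "subset" test on toy bitmasks. */
#include <stdbool.h>
#include <stdio.h>

/* does the task's mask cover every bit of the possible mask? */
static bool mask_full(unsigned long task, unsigned long possible)
{
	return (task & possible) == possible;
}

/* is every bit of sub also set in super? */
static bool mask_subset(unsigned long sub, unsigned long super)
{
	return (sub & ~super) == 0;
}

int main(void)
{
	unsigned long possible_cpus = 0xff;	/* 8 possible CPU IDs */
	unsigned long all_dom_cpus  = 0x0f;	/* only CPUs 0-3 belong to any domain */
	unsigned long task_cpus     = 0x0f;	/* task may run on every domain CPU */

	/* old test: prints 0, i.e. the task is wrongly treated as restricted */
	printf("full(task, possible)   = %d\n", mask_full(task_cpus, possible_cpus));
	/* new test: prints 1, every domain CPU is allowed, so it is unrestricted */
	printf("subset(all_doms, task) = %d\n", mask_subset(all_dom_cpus, task_cpus));
	return 0;
}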
--- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 59adec80ed29d..436297e6dcac9 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -155,6 +155,7 @@ struct tune_input{ } tune_input; __u64 tune_params_gen; +private(A) struct bpf_cpumask __kptr *all_cpumask; private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask; private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask; @@ -790,7 +791,8 @@ void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, } task_pick_and_set_domain(task_ctx, p, cpumask, false); - task_ctx->all_cpus = bpf_cpumask_full(cpumask); + if (all_cpumask) + task_ctx->all_cpus = bpf_cpumask_subset(all_cpumask, cpumask); } s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, @@ -890,8 +892,14 @@ static s32 create_dom(u32 dom_id) return -ENOENT; } - if (*dmask & (1LLU << (cpu % 64))) + if (*dmask & (1LLU << (cpu % 64))) { bpf_cpumask_set_cpu(cpu, cpumask); + + bpf_rcu_read_lock(); + if (all_cpumask) + bpf_cpumask_set_cpu(cpu, all_cpumask); + bpf_rcu_read_unlock(); + } } cpumask = bpf_kptr_xchg(&domc->cpumask, cpumask); @@ -924,17 +932,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) struct bpf_cpumask *cpumask; s32 i, ret; - if (!switch_partial) - scx_bpf_switch_all(); - - bpf_for(i, 0, nr_doms) { - ret = create_dom(i); - if (ret) - return ret; - } - - bpf_for(i, 0, nr_cpus) - pcpu_ctx[i].dom_rr_cur = i; + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); cpumask = bpf_cpumask_create(); if (!cpumask) @@ -950,6 +953,18 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) if (cpumask) bpf_cpumask_release(cpumask); + if (!switch_partial) + scx_bpf_switch_all(); + + bpf_for(i, 0, nr_doms) { + ret = create_dom(i); + if (ret) + return ret; + } + + bpf_for(i, 0, nr_cpus) + pcpu_ctx[i].dom_rr_cur = i; + return 0; } From f4fd473fb3205f07b961e4e272f619f7f5940d0f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 2 Oct 2023 12:04:19 -0500 Subject: [PATCH 110/304] central: Allow specifying the slice length in the central scheduler Researchers at Inria-Paris are experimenting with the central scheduler, and want to try setting different slice lengths to see how they affect performance for VMs running the NAS benchmarks. Let's make this convenient by allowing it to be passed as a parameter from user space. 
Signed-off-by: David Vernet --- tools/sched_ext/scx_central.bpf.c | 3 ++- tools/sched_ext/scx_central.c | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 67e6412bd5d83..87a28092b65f3 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -58,6 +58,7 @@ enum { const volatile bool switch_partial; const volatile s32 central_cpu; const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ +const volatile u64 slice_ns = SCX_SLICE_DFL; u64 nr_total, nr_locals, nr_queued, nr_lost_pids; u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; @@ -263,7 +264,7 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) /* kick iff the current one exhausted its slice */ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); if (started_at && *started_at && - vtime_before(now, *started_at + SCX_SLICE_DFL)) + vtime_before(now, *started_at + slice_ns)) continue; /* and there's something pending */ diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 580d4b50172fa..7b95efd9fff5e 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -18,8 +18,9 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-c CPU] [-p]\n" +"Usage: %s [-s SLICE_US] [-c CPU] [-p]\n" "\n" +" -s SLICE_US Override slice duration\n" " -c CPU Override the central CPU (default: 0)\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -h Display this help and exit\n"; @@ -49,8 +50,11 @@ int main(int argc, char **argv) skel->rodata->central_cpu = 0; skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); - while ((opt = getopt(argc, argv, "c:ph")) != -1) { + while ((opt = getopt(argc, argv, "s:c:ph")) != -1) { switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; case 'c': skel->rodata->central_cpu = strtoul(optarg, NULL, 0); break; From 3e74dbdffdb45caca472bc3c5ba0a12c92347056 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 2 Oct 2023 18:25:08 -0500 Subject: [PATCH 111/304] central: Pin timer callbacks to central CPU The scx_central scheduler specifies an infinite slice for all cores other than a "central" core where scheduling decisions are made. This scheduler currently suffers from the fact that the BPF timer may be invoked on a different core than the central scheduler, due to BPF timers not supporting being pinned to specific CPUs. That capability was proposed upstream for BPF in [0]. If and when it lands, we would need to invoke bpf_timer_start() from the core that we want the timer pinned to, because the API does not support specifying a core to have the timer invoked from. To accommodate this, we can affinitize the loading thread to the central CPU before loading the scheduler, and then pin from there. [0]: https://lore.kernel.org/bpf/20231002234708.331192-2-void@manifault.com/T/ Though the BPF timer pinning feature has not yet landed, we can still set the stage for leveraging it by adding the logic to affinitize the loading thread to the central CPU. While we won't yet have a guarantee that the timer will be pinned to the same core throughout the runtime of the scheduler, in practice, it seems that affinitizing in this manner does make it very likely regardless. 
In addition, the user space component of the central scheduler doesn't benefit from running on a tickless core, so keeping it affinitized to the central CPU avoids it from preempting a task on a tickless core that would otherwise benefit from less preemption. Signed-off-by: David Vernet --- tools/sched_ext/scx_central.bpf.c | 3 +++ tools/sched_ext/scx_central.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 87a28092b65f3..0eef2fe4bd731 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -303,6 +303,9 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init) if (!timer) return -ESRCH; + if (bpf_get_smp_processor_id() != central_cpu) + return -EINVAL; + bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); bpf_timer_set_callback(timer, central_timerfn); ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 7b95efd9fff5e..b77a15ac6f5b9 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -4,6 +4,8 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ +#define _GNU_SOURCE +#include #include #include #include @@ -38,6 +40,7 @@ int main(int argc, char **argv) struct bpf_link *link; __u64 seq = 0; __s32 opt; + cpu_set_t *cpuset; signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); @@ -73,6 +76,26 @@ int main(int argc, char **argv) SCX_BUG_ON(scx_central__load(skel), "Failed to load skel"); + /* + * Affinitize the loading thread to the central CPU, as: + * - That's where the BPF timer is first invoked in the BPF program. + * - We probably don't want this user space component to take up a core + * from a task that would benefit from avoiding preemption on one of + * the tickless cores. + * + * Until BPF supports pinning the timer, it's not guaranteed that it + * will always be invoked on the central CPU. In practice, this + * suffices the majority of the time. + */ + cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); + SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); + CPU_ZERO(cpuset); + CPU_SET(skel->rodata->central_cpu, cpuset); + SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), + "Failed to affinitize to central CPU %d (max %d)", + skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); + CPU_FREE(cpuset); + link = bpf_map__attach_struct_ops(skel->maps.central_ops); SCX_BUG_ON(!link, "Failed to attach struct_ops"); From 7b1ca1978b88c66faf4407e0352c31d5ddb9409c Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 3 Oct 2023 15:17:08 -0500 Subject: [PATCH 112/304] scx: Fix typo in tickless comment There's a comment that says can_stop_tick_scx(). The function is scx_can_stop_tick(). Signed-off-by: David Vernet --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bbad3a5cb42ab..c221ee3efaf5c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1663,7 +1663,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) /* * @p is getting newly scheduled or got kicked after someone updated its - * slice. Refresh whether tick can be stopped. See can_stop_tick_scx(). + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 
*/ if ((p->scx.slice == SCX_SLICE_INF) != (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { From 101c601e33095bf6c3c05d07fd53c434bd439f20 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 6 Oct 2023 20:53:15 -0500 Subject: [PATCH 113/304] scx: Add missing piece Forgot to git add a small conflict to resolve Signed-off-by: David Vernet --- kernel/sched/sched.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e1c62c7d0e6e4..0b33d01178ca9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3607,7 +3607,9 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } #endif -<<<<<<< HEAD +extern u64 avg_vruntime(struct cfs_rq *cfs_rq); +extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_CGROUP_SCHED enum cpu_cftype_id { #if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) @@ -3629,9 +3631,6 @@ enum cpu_cftype_id { extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; #endif /* CONFIG_CGROUP_SCHED */ -extern u64 avg_vruntime(struct cfs_rq *cfs_rq); -extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); - #include "ext.h" #endif /* _KERNEL_SCHED_SCHED_H */ From 88a818f58b0a287e18ac76c71127d054b5cc6781 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 9 Oct 2023 10:08:38 -0500 Subject: [PATCH 114/304] central: Pin timer to the central CPU In commit d6247ecb6c1e ("bpf: Add ability to pin bpf timer to calling CPU"), BPF added the ability to be able to pin a BPF timer to the calling CPU. Let's use this capability from the central scheduler. Signed-off-by: David Vernet --- tools/sched_ext/scx_central.bpf.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 0eef2fe4bd731..de05779619878 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -252,7 +252,14 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) { u64 now = bpf_ktime_get_ns(); u64 nr_to_kick = nr_queued; - s32 i; + s32 i, curr_cpu; + + curr_cpu = bpf_get_smp_processor_id(); + if (curr_cpu != central_cpu) { + scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", + curr_cpu, central_cpu); + return 0; + } bpf_for(i, 0, nr_cpu_ids) { s32 cpu = (nr_timers + i) % nr_cpu_ids; @@ -279,9 +286,7 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); } - scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); - - bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); __sync_fetch_and_add(&nr_timers, 1); return 0; } @@ -308,7 +313,7 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init) bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); bpf_timer_set_callback(timer, central_timerfn); - ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); return ret; } From 3b2417571681049b127e3bf57d5a04ee2ba6af3e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 11 Oct 2023 17:19:23 -0500 Subject: [PATCH 115/304] scx: Refactor and clean up build system The current scx build system is a bit hacky. We put some build artifacts in a tools/ directory, and others (skel files and .bpf.o files) we leave in the current directory. 
This isn't conducive to environments that want to package sched_ext schedulers. This patch therefore updates the Makefile to have the build put all build artifacts (including the compiled binaries for the schedulers into an build/ directory (previously tools/). All artifacts will be deployed as follows: build/bin: Compiled binaries (e.g. scx_simple, scx_central, etc) build/sbin: Compiled binaries that are used as part of the build process, e.g. bpftool build/include: Headers that are visible from .c files build/obj: Contains object files and libraries that are used as part of the build process build/obj/bpftool: Build artifacts from compiling bpftool from source build/obj/libbpf: Build artifacts from compiling libbpf from source build/obj/sched_ext: Build artifacts from compiling and linking BPF programs and their user space counterparts. build/release: Build output from Cargo for Rust schedulers This patch also adds the following enhancement: - Support for changing the build directory output by specifying the O environment variable, as in: $ O=/tmp/sched_ext make CC=clang LLVM=1 -j to output all artifacts for that build job to /tmp/sched_ext/build - Removing code duplication by defining a ccsched make function for compiling schedulers, and an $(SCX_COMMON_DEPS) variable for common dependencies. Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 112 +++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 8ad8e186aefa9..026ce49249a73 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -47,23 +47,29 @@ APIDIR := $(TOOLSINCDIR)/uapi GENDIR := $(abspath ../../include/generated) GENHDR := $(GENDIR)/autoconf.h -SCRATCH_DIR := $(CURDIR)/tools -BUILD_DIR := $(SCRATCH_DIR)/build -INCLUDE_DIR := $(SCRATCH_DIR)/include -BPFOBJ_DIR := $(BUILD_DIR)/libbpf +ifeq ($(O),) +OUTPUT_DIR := $(CURDIR)/build +else +OUTPUT_DIR := $(O)/build +endif # O +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BINDIR := $(OUTPUT_DIR)/bin BPFOBJ := $(BPFOBJ_DIR)/libbpf.a ifneq ($(CROSS_COMPILE),) -HOST_BUILD_DIR := $(BUILD_DIR)/host -HOST_SCRATCH_DIR := host-tools -HOST_INCLUDE_DIR := $(HOST_SCRATCH_DIR)/include +HOST_BUILD_DIR := $(OBJ_DIR)/host +HOST_OUTPUT_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include else -HOST_BUILD_DIR := $(BUILD_DIR) -HOST_SCRATCH_DIR := $(SCRATCH_DIR) +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) HOST_INCLUDE_DIR := $(INCLUDE_DIR) endif HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool +DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ @@ -85,7 +91,7 @@ CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -CARGOFLAGS := --release +CARGOFLAGS := --release --target-dir $(OUTPUT_DIR) # Silence some warnings when compiled with clang ifneq ($(LLVM),) @@ -120,9 +126,9 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland scx_rusty # sort removes libbpf duplicates when not cross-building -MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ +MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf 
$(HOST_BUILD_DIR)/libbpf \ $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ - $(INCLUDE_DIR)) + $(INCLUDE_DIR) $(SCXOBJ_DIR) $(BINDIR)) $(MAKE_DIRS): $(call msg,MKDIR,,$@) @@ -130,10 +136,10 @@ $(MAKE_DIRS): $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(APIDIR)/linux/bpf.h \ - | $(BUILD_DIR)/libbpf - $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + | $(OBJ_DIR)/libbpf + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \ EXTRA_CFLAGS='-g -O0 -fPIC' \ - DESTDIR=$(SCRATCH_DIR) prefix= all install_headers + DESTDIR=$(OUTPUT_DIR) prefix= all install_headers $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool @@ -142,8 +148,8 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ - LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/ \ - prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install-bin + LIBBPF_DESTDIR=$(HOST_OUTPUT_DIR)/ \ + prefix= DESTDIR=$(HOST_OUTPUT_DIR)/ install-bin $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) @@ -155,53 +161,53 @@ else endif %.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h \ - | $(BPFOBJ) + | $(BPFOBJ) $(SCXOBJ_DIR) $(call msg,CLNG-BPF,,$@) - $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $(SCXOBJ_DIR)/$@ %.skel.h: %.bpf.o $(BPFTOOL) $(call msg,GEN-SKEL,,$@) - $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< - $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) - $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) - $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) - $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $@ - $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $(@:.skel.h=.subskel.h) - -scx_simple: scx_simple.c scx_simple.skel.h user_exit_info.h scx_user_common.h - $(CC) $(CFLAGS) -c $< -o $@.o - $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) - -scx_qmap: scx_qmap.c scx_qmap.skel.h user_exit_info.h scx_user_common.h - $(CC) $(CFLAGS) -c $< -o $@.o - $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) - -scx_central: scx_central.c scx_central.skel.h user_exit_info.h scx_user_common.h - $(CC) $(CFLAGS) -c $< -o $@.o - $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) - -scx_pair: scx_pair.c scx_pair.skel.h user_exit_info.h scx_user_common.h - $(CC) $(CFLAGS) -c $< -o $@.o - $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) - -scx_flatcg: scx_flatcg.c scx_flatcg.skel.h user_exit_info.h scx_user_common.h - $(CC) $(CFLAGS) -c $< -o $@.o - $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) - -scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h user_exit_info.h \ - scx_user_common.h - $(CC) $(CFLAGS) -c $< -o $@.o - $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + $(Q)$(BPFTOOL) gen object $(SCXOBJ_DIR)/$(<:.o=.linked1.o) $(SCXOBJ_DIR)/$< + $(Q)$(BPFTOOL) gen object $(SCXOBJ_DIR)/$(<:.o=.linked2.o) $(SCXOBJ_DIR)/$(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(SCXOBJ_DIR)/$(<:.o=.linked3.o) $(SCXOBJ_DIR)/$(<:.o=.linked2.o) + $(Q)diff $(SCXOBJ_DIR)/$(<:.o=.linked2.o) $(SCXOBJ_DIR)/$(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(SCXOBJ_DIR)/$(<:.o=.linked3.o) name $(<:.bpf.o=) > $(INCLUDE_DIR)/$@ + $(Q)$(BPFTOOL) gen subskeleton $(SCXOBJ_DIR)/$(<:.o=.linked3.o) name $(<:.bpf.o=) > $(INCLUDE_DIR)/$(@:.skel.h=.subskel.h) + +define ccsched + $(CC) $(CFLAGS) -c $(1) 
-o $(SCXOBJ_DIR)/$(2).o + $(CC) -o $(BINDIR)/$(2) $(SCXOBJ_DIR)/$(2).o $(HOST_BPFOBJ) $(LDFLAGS) +endef + +SCX_COMMON_DEPS := user_exit_info.h scx_user_common.h | $(BINDIR) +scx_simple: scx_simple.c scx_simple.skel.h $(SCX_COMMON_DEPS) + $(call ccsched,$<,$@) + +scx_qmap: scx_qmap.c scx_qmap.skel.h $(SCX_COMMON_DEPS) + $(call ccsched,$<,$@) + +scx_central: scx_central.c scx_central.skel.h $(SCX_COMMON_DEPS) + $(call ccsched,$<,$@) + +scx_pair: scx_pair.c scx_pair.skel.h $(SCX_COMMON_DEPS) + $(call ccsched,$<,$@) + +scx_flatcg: scx_flatcg.c scx_flatcg.skel.h $(SCX_COMMON_DEPS) + $(call ccsched,$<,$@) + +scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h $(SCX_COMMON_DEPS) + $(call ccsched,$<,$@) scx_rusty: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) scx_rusty: export SCX_RUSTY_CLANG = $(CLANG) scx_rusty: export SCX_RUSTY_BPF_CFLAGS = $(BPF_CFLAGS) -scx_rusty: $(INCLUDE_DIR)/vmlinux.h +scx_rusty: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) cargo build --manifest-path=scx_rusty/Cargo.toml $(CARGOFLAGS) + $(Q)cp $(OUTPUT_DIR)/release/$@ $(BINDIR)/$@ clean: cargo clean --manifest-path=scx_rusty/Cargo.toml - rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland From ac229e5fb30d5f1602a7476757a5e3b233ac5aaa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 11 Oct 2023 18:02:24 -0500 Subject: [PATCH 116/304] scx: Add make install target for installing schedulers Another requirement of packaging systems is to be able to install compiled schedulers in some reachable PATH endpoint so they can be accessed easily. This patch adds a new install target in Make for this, which installs the schedulers on the system at /usr/bin. The user also has the option of specifying DESTDIR to indicate a prefix of /usr/bin. Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 026ce49249a73..dc2ff555d1bd8 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -205,6 +205,10 @@ scx_rusty: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) cargo build --manifest-path=scx_rusty/Cargo.toml $(CARGOFLAGS) $(Q)cp $(OUTPUT_DIR)/release/$@ $(BINDIR)/$@ +install: all + $(Q)mkdir -p $(DESTDIR)/usr/bin/ + $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/bin/ + clean: cargo clean --manifest-path=scx_rusty/Cargo.toml rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) From 7fc318482ec063eba13568a5452d18834f6d8491 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 11 Oct 2023 18:18:03 -0500 Subject: [PATCH 117/304] scx: Add Make help target for explaining build options It's mostly self evident, but now that we support environment variables to dictate build behavior, we should document them in a clean and easy to consume way. 
Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index dc2ff555d1bd8..7d177acdec15c 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -215,7 +215,44 @@ clean: rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland -.PHONY: all scx_rusty clean +help: + @echo 'Building targets:' + @echo ' all - Compile all schedulers' + @echo '' + @echo 'Alternatively, you may compile individual schedulers:' + @echo ' scx_simple' + @echo ' scx_qmap' + @echo ' scx_central' + @echo ' scx_pair' + @echo ' scx_flatcg' + @echo ' scx_userland' + @echo ' scx_rusty' + @echo '' + @echo 'For any scheduler build target, you may specify an alternative' + @echo 'build output path with the O= environment variable. For example:' + @echo '' + @echo ' O=/tmp/sched_ext make all' + @echo '' + @echo 'will compile all schedulers, and emit the build artifacts to' + @echo '/tmp/sched_ext/build.' + @echo '' + @echo '' + @echo 'Installing targets:' + @echo ' install - Compile and install all schedulers to /usr/bin.' + @echo ' You may specify the DESTDIR= environment variable' + @echo ' to indicate a prefix for /usr/bin. For example:' + @echo '' + @echo ' DESTDIR=/tmp/sched_ext make install' + @echo '' + @echo ' will build the schedulers in CWD/build, and' + @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' + @echo '' + @echo '' + @echo 'Cleaning targets:' + @echo ' clean - Remove all generated files, including intermediate' + @echo ' rust files for rust schedulers.' + +.PHONY: all scx_rusty clean help # delete failed targets .DELETE_ON_ERROR: From 8180b1b7e87128381041e1514f9a2723bafbd4b6 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 12 Oct 2023 10:46:10 -0500 Subject: [PATCH 118/304] rusty: Support downloading rusty deps in separate build step Cargo supports the cargo fetch command to fetch dependencies via the network before compiling with cargo build. Let's put it into a separate Makefile target so that packaging systems can separate steps that require network access from just building. 
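As a rough sketch of the intent (the Makefile change below wires these up as the scx_rusty_deps and scx_rusty targets), the split corresponds to two cargo invocations:

$ cargo fetch --manifest-path=scx_rusty/Cargo.toml                 # network access required
$ cargo build --manifest-path=scx_rusty/Cargo.toml --release --offline   # builds against the pre-fetched deps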
Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 7d177acdec15c..fc2dca246d721 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -198,11 +198,13 @@ scx_flatcg: scx_flatcg.c scx_flatcg.skel.h $(SCX_COMMON_DEPS) scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h $(SCX_COMMON_DEPS) $(call ccsched,$<,$@) +scx_rusty_deps: $(SCX_COMMON_DEPS) + cargo fetch --manifest-path=scx_rusty/Cargo.toml scx_rusty: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) scx_rusty: export SCX_RUSTY_CLANG = $(CLANG) scx_rusty: export SCX_RUSTY_BPF_CFLAGS = $(BPF_CFLAGS) -scx_rusty: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) - cargo build --manifest-path=scx_rusty/Cargo.toml $(CARGOFLAGS) +scx_rusty: $(INCLUDE_DIR)/vmlinux.h scx_rusty_deps + cargo build --manifest-path=$@/Cargo.toml --offline $(CARGOFLAGS) $(Q)cp $(OUTPUT_DIR)/release/$@ $(BINDIR)/$@ install: all From 8ce9d1ef9ab7070d7610a9efa3e3ddef9379afba Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 12 Oct 2023 11:42:49 -0500 Subject: [PATCH 119/304] scx: Don't specify nightly rustup as dependency We were previously under the impression that the rustup nightly toolchain was required to build the schedulers. Daan pointed out in [0] that he was able to build with stable, and I similarly was able to build with rust stable 1.70.0. Let's update the README accordingly. [0]: https://github.com/sched-ext/sched_ext/issues/57 We also update the README to not explicitly require compiling the schedulers with $ make LLVM=1 CC=clang The BPF schedulers are automatically compiled with clang. If you compile this way, the user space portions will be compiled with gcc, which is fine. Signed-off-by: David Vernet --- tools/sched_ext/README | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/sched_ext/README b/tools/sched_ext/README index ac326560ab2c8..c125a66efdb8a 100644 --- a/tools/sched_ext/README +++ b/tools/sched_ext/README @@ -37,11 +37,11 @@ is actively working on adding a BPF backend compiler as well, but are still missing some features such as BTF type tags which are necessary for using kptrs. -2. rustup nightly +2. rust >= 1.70.0 -Rusty's user space load balancing component is written in Rust, and uses -nightly features. You'll need to use the nightly build from rustup in order to -compile it. +scx_rusty's user space load balancing component is written in Rust, and uses +features present in the rust toolchain >= 1.70.0. You should be able to use the +stable build from rustup. There are other requirements as well, such as make, but these are the main / non-trivial ones. @@ -49,9 +49,9 @@ non-trivial ones. Compiling the schedulers ------------------------ -Once you have your toolchain setup, you can compile the schedulers as follows: +Once you have your toolchain setup, you can compile the schedulers using make: -$ make CC=clang LLVM=1 -j +$ make -j See Documentation/scheduler/sched-ext.rst for a description of the config options required to compile a sched_ext kernel. From 38ad0e8ecfca1fbf3bd8a2fd80aaad5e41bc5b9b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 16 Oct 2023 10:27:06 -0500 Subject: [PATCH 120/304] rusty: Further tweak build system We previously separated the scx_rusty build into two steps -- a step to download dependencies, and another to build. 
That mostly works, except that the download-dependencies step is always run before the build step as it's a dependency. Even though it doesn't download any cargo dependencies, it still accesses the network. Let's add a way for builders to pass --offline to cargo via a CARGO_OFFLINE make variable so that we don't need scx_rusty_deps to be a dependency of scx_rusty. Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index fc2dca246d721..f1d718c2ff1ce 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -92,6 +92,9 @@ CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -I$(TOOLSINCDIR) -I$(APIDIR) CARGOFLAGS := --release --target-dir $(OUTPUT_DIR) +ifneq ($(CARGO_OFFLINE),) +CARGOFLAGS += --offline +endif # Silence some warnings when compiled with clang ifneq ($(LLVM),) @@ -198,13 +201,21 @@ scx_flatcg: scx_flatcg.c scx_flatcg.skel.h $(SCX_COMMON_DEPS) scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h $(SCX_COMMON_DEPS) $(call ccsched,$<,$@) -scx_rusty_deps: $(SCX_COMMON_DEPS) +# Separate build target that is available for build systems to use to fetch +# dependencies in a separate step from building. This allows the scheduler +# to be compiled without network access. +# +# If the scx_rusty Make target is invoked without CARGO_OFFLINE=1 (e.g. if +# building locally), then cargo build will download all of the necessary +# dependencies, and scx_rusty_deps can be skipped. +scx_rusty_deps: cargo fetch --manifest-path=scx_rusty/Cargo.toml + scx_rusty: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) scx_rusty: export SCX_RUSTY_CLANG = $(CLANG) scx_rusty: export SCX_RUSTY_BPF_CFLAGS = $(BPF_CFLAGS) -scx_rusty: $(INCLUDE_DIR)/vmlinux.h scx_rusty_deps - cargo build --manifest-path=$@/Cargo.toml --offline $(CARGOFLAGS) +scx_rusty: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) + cargo build --manifest-path=$@/Cargo.toml $(CARGOFLAGS) $(Q)cp $(OUTPUT_DIR)/release/$@ $(BINDIR)/$@ install: all @@ -239,6 +250,24 @@ help: @echo '/tmp/sched_ext/build.' @echo '' @echo '' + @echo 'Rust schedulers:' + @echo ' scx_rusty - Build the scx_rusty load balancing scheduler.' + @echo ' scx_rusty_deps - Download the scx_rusty scheduler cargo dependencies.' + @echo '' + @echo 'For any cargo rust schedulers built with cargo, you can specify' + @echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the' + @echo 'network (e.g. if the scheduler is being packaged).' + @echo '' + @echo 'For such use cases, the build workflow will look something like this:' + @echo '' + @echo ' make scx_rusty_deps' + @echo ' CARGO_OFFLINE=1 make scx_rusty' + @echo '' + @echo 'If network access during build is allowed, you can just make scx_rusty' + @echo 'directly without CARGO_OFFLINE, and dependencies will be downloaded' + @echo 'during the build step.' + @echo '' + @echo '' @echo 'Installing targets:' @echo ' install - Compile and install all schedulers to /usr/bin.' 
@echo ' You may specify the DESTDIR= environment variable' From bd8d7d2e0040ab6d2346413b4f5c748e8aac3a54 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 30 Oct 2023 16:25:53 -0500 Subject: [PATCH 121/304] scx: Improve example schedulers README file Signed-off-by: David Vernet --- tools/sched_ext/Kconfig | 9 + tools/sched_ext/{README => README.md} | 296 +++++++++++++++++++------- 2 files changed, 225 insertions(+), 80 deletions(-) create mode 100644 tools/sched_ext/Kconfig rename tools/sched_ext/{README => README.md} (51%) diff --git a/tools/sched_ext/Kconfig b/tools/sched_ext/Kconfig new file mode 100644 index 0000000000000..6543fcf199f6e --- /dev/null +++ b/tools/sched_ext/Kconfig @@ -0,0 +1,9 @@ +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y diff --git a/tools/sched_ext/README b/tools/sched_ext/README.md similarity index 51% rename from tools/sched_ext/README rename to tools/sched_ext/README.md index c125a66efdb8a..c5031d073a491 100644 --- a/tools/sched_ext/README +++ b/tools/sched_ext/README.md @@ -1,9 +1,7 @@ - ============================ - SCHED_EXT EXAMPLE SCHEDULERS - ============================ +SCHED_EXT EXAMPLE SCHEDULERS +============================ -Introduction -============ +# Introduction This directory contains a number of example sched_ext schedulers. These schedulers are meant to provide examples of different types of schedulers @@ -22,13 +20,11 @@ not they're production ready. For more details on any of these schedulers, please see the header comment in their .bpf.c file. -Compiling the examples -====================== +# Compiling the examples There are a few toolchain dependencies for compiling the example schedulers. -Toolchain dependencies ----------------------- +## Toolchain dependencies 1. clang >= 16.0.0 @@ -41,78 +37,177 @@ kptrs. scx_rusty's user space load balancing component is written in Rust, and uses features present in the rust toolchain >= 1.70.0. You should be able to use the -stable build from rustup. +stable build from rustup, but if that doesn't work, try using the rustup +nightly build. There are other requirements as well, such as make, but these are the main / non-trivial ones. -Compiling the schedulers ------------------------- - -Once you have your toolchain setup, you can compile the schedulers using make: - -$ make -j - -See Documentation/scheduler/sched-ext.rst for a description of the config -options required to compile a sched_ext kernel. - -Schedulers -========== +3. pahole >= 1.25 + +You may need pahole in order to generate BTF from DWARF. + +## Compiling the kernel + +In order to run a sched_ext scheduler, you'll have to run a kernel compiled +with the patches in this repository, and with a minimum set of necessary +Kconfig options: + +``` +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +``` + +It's also recommended that you also include the following Kconfig options: + +``` +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y +``` + +There is a `Kconfig` file in this directory whose contents you can append to +your local `.config` file, as long as there are no conflicts with any existing +options in the file. + +## Getting a vmlinux.h file + +You may notice that most of the example schedulers include a "vmlinux.h" file. 
+This is a large, auto-generated header file that contains all of the types +defined in some vmlinux binary that was compiled with +[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig +options specified above). + +The header file is created using `bpftool`, by passing it a vmlinux binary +compiled with BTF as follows: + +```bash +$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h +``` + +`bpftool` analyzes all of the BTF encodings in the binary, and produces a +header file that can be included by BPF programs to access those types. For +example, using vmlinux.h allows a scheduler to access fields defined directly +in vmlinux as follows: + +```c +#include "vmlinux.h" +// vmlinux.h is also implicitly included by scx_common.bpf.h. +#include "scx_common.bpf.h" + +/* + * vmlinux.h provides definitions for struct task_struct and + * struct scx_enable_args. + */ +void BPF_STRUCT_OPS(example_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + bpf_printk("Task %s enabled in example scheduler", p->comm); +} + +// vmlinux.h provides the definition for struct sched_ext_ops. +SEC(".struct_ops.link") +struct sched_ext_ops example_ops { + .enable = (void *)example_enable, + .name = "example", +} +``` + +The scheduler build system will generate this vmlinux.h file as part of the +scheduler build pipeline. It looks for a vmlinux file in the following +dependency order: + +1. If the O= environment variable is defined, at `$O/vmlinux` +2. If the KBUILD_OUTPUT= environment variable is defined, at + `$KBUILD_OUTPUT/vmlinux` +3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're + compiling the schedulers) +3. `/sys/kernel/btf/vmlinux` +4. `/boot/vmlinux-$(uname -r)` + +In other words, if you have compiled a kernel in your local repo, its vmlinux +file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of +the kernel you're currently running on. This means that if you're running on a +kernel with sched_ext support, you may not need to compile a local kernel at +all. + +### Aside on CO-RE + +One of the cooler features of BPF is that it supports +[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run +Everywhere). This feature allows you to reference fields inside of structs with +types defined internal to the kernel, and not have to recompile if you load the +BPF program on a different kernel with the field at a different offset. In our +example above, we print out a task name with `p->comm`. CO-RE would perform +relocations for that access when the program is loaded to ensure that it's +referencing the correct offset for the currently running kernel. + +## Compiling the schedulers + +Once you have your toolchain setup, and a vmlinux that can be used to generate +a full vmlinux.h file, you can compile the schedulers using `make`: + +```bash +$ make -j($nproc) +``` + +# Schedulers This section lists, in alphabetical order, all of the current example schedulers. -------------------------------------------------------------------------------- -Rusty ------ +## Rusty -Overview -~~~~~~~~ +### Overview A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the scheduler does a simple round robin in each domain, and the user space portion (written in Rust) calculates the load factor of each domain, and informs BPF of how tasks should be load balanced accordingly. 
-Typical Use Case -~~~~~~~~~~~~~~~~ +### Typical Use Case Rusty is designed to be flexible, and accommodate different architectures and workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), as well as how Rusty should partition the system into scheduling domains, can be tuned to achieve the optimal configuration for any given system or workload. -Production Ready? -~~~~~~~~~~~~~~~~~ +### Production Ready? -Yes. If tuned correctly, Rusty should be performant across various CPU +Yes. If tuned correctly, rusty should be performant across various CPU architectures and workloads. Rusty by default creates a separate scheduling domain per-LLC, so its default configuration may be performant as well. +That said, you may run into an issue with infeasible weights, where a task with +a very high weight may cause the scheduler to incorrectly leave cores idle +because it thinks they're necessary to accommodate the compute for a single +task. This can also happen in CFS, and should soon be addressed for rusty. + -------------------------------------------------------------------------------- -scx_central ------------ +## scx_central -Overview -~~~~~~~~ +### Overview A "central" scheduler where scheduling decisions are made from a single CPU. This scheduler illustrates how scheduling decisions can be dispatched from a single CPU, allowing other cores to run with infinite slices, without timer ticks, and without having to incur the overhead of making scheduling decisions. -Typical Use Case -~~~~~~~~~~~~~~~~ +### Typical Use Case This scheduler could theoretically be useful for any workload that benefits from minimizing scheduling overhead and timer ticks. An example of where this could be particularly useful is running VMs, where running with infinite slices and no timer ticks allows the VM to avoid unnecessary expensive vmexits. -Production Ready? -~~~~~~~~~~~~~~~~~ +### Production Ready? Not yet. While tasks are run with an infinite slice (SCX_SLICE_INF), they're preempted every 20ms in a timer callback. The scheduler also puts the core @@ -121,11 +216,9 @@ and does not yet have any kind of priority mechanism. -------------------------------------------------------------------------------- -scx_flatcg ----------- +## scx_flatcg -Overview -~~~~~~~~ +### Overview A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical weight-based cgroup CPU control by flattening the cgroup hierarchy into a @@ -133,15 +226,13 @@ single layer, by compounding the active weight share at each level. The effect of this is a much more performant CPU controller, which does not need to descend down cgroup trees in order to properly compute a cgroup's share. -Typical Use Case -~~~~~~~~~~~~~~~~ +### Typical Use Case This scheduler could be useful for any typical workload requiring a CPU controller, but which cannot tolerate the higher overheads of the fair CPU controller. -Production Ready? -~~~~~~~~~~~~~~~~~ +### Production Ready? Yes, though the scheduler (currently) does not adequately accommodate thundering herds of cgroups. If, for example, many cgroups which are nested @@ -150,75 +241,64 @@ able to consume more CPU cycles than they are entitled to. -------------------------------------------------------------------------------- -scx_pair --------- +## scx_pair -Overview -~~~~~~~~ +### Overview A sibling scheduler which ensures that tasks will only ever be co-located on a physical core if they're in the same cgroup. 
It illustrates how a scheduling policy could be implemented to mitigate CPU bugs, such as L1TF, and also shows -how some useful kfuncs such as scx_bpf_kick_cpu() can be utilized. +how some useful kfuncs such as `scx_bpf_kick_cpu()` can be utilized. -Typical Use Case -~~~~~~~~~~~~~~~~ +### Typical Use Case While this scheduler is only meant to be used to illustrate certain sched_ext features, with a bit more work (e.g. by adding some form of priority handling inside and across cgroups), it could have been used as a way to quickly mitigate L1TF before core scheduling was implemented and rolled out. -Production Ready? -~~~~~~~~~~~~~~~~~ +### Production Ready? No -------------------------------------------------------------------------------- -scx_qmap --------- +## scx_qmap -Overview -~~~~~~~~ +### Overview Another simple, yet slightly more complex scheduler that provides an example of a basic weighted FIFO queuing policy. It also provides examples of some common useful BPF features, such as sleepable per-task storage allocation in the -ops.prep_enable() callback, and using the BPF_MAP_TYPE_QUEUE map type to +`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to enqueue tasks. It also illustrates how core-sched support could be implemented. -Typical Use Case -~~~~~~~~~~~~~~~~ +### Typical Use Case Purely used to illustrate sched_ext features. -**Production Ready?** +### Production Ready? No -------------------------------------------------------------------------------- -scx_simple ----------- +## scx_simple -Overview -~~~~~~~~ +### Overview A simple scheduler that provides an example of a minimal sched_ext scheduler. scx_simple can be run in either global weighted vtime mode, or FIFO mode. -Typical Use Case -~~~~~~~~~~~~~~~~ +### Typical Use Case Though very simple, this scheduler should perform reasonably well on single-socket CPUs with a uniform L3 cache topology. Note that while running in global FIFO mode may work well for some workloads, saturating threads can easily drown out inactive ones. -Production Ready? -~~~~~~~~~~~~~~~~~ +### Production Ready? This scheduler could be used in a production environment, assuming the hardware constraints enumerated above, and assuming the workload can accommodate a @@ -226,18 +306,15 @@ simple scheduling policy. -------------------------------------------------------------------------------- -scx_userland ------------- +## scx_userland -Overview -~~~~~~~~ +### Overview A simple weighted vtime scheduler where all scheduling decisions take place in user space. This is in contrast to Rusty, where load balancing lives in user space, but scheduling decisions are still made in the kernel. -Typical Use Case -~~~~~~~~~~~~~~~~ +### Typical Use Case There are many advantages to writing schedulers in user space. For example, you can use a debugger, you can write the scheduler in Rust, and you can use data @@ -253,10 +330,69 @@ kernel and user space. A more robust solution to this would be building a user space scheduling framework that abstracts much of this complexity away from you. -Production Ready? -~~~~~~~~~~~~~~~~~ +### Production Ready? No. This scheduler uses an ordered list for vtime scheduling, and is stricly less performant than just using something like `scx_simple`. It is purely meant to illustrate that it's possible to build a user space scheduler on top of sched_ext. + +# Troubleshooting + +There are a number of common issues that you may run into when building the +schedulers. We'll go over some of the common ones here. 
+ +## Build Failures + +### Old version of clang + +``` +error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + ^~~~~~~~~~~~~~~~~~~~ +1 error generated. +``` + +This means you built the kernel or the schedulers with an older version of +clang than what's supported (i.e. older than 16.0.0). To remediate this: + +1. `which clang` to make sure you're using a sufficiently new version of clang. + +2. `make mrproper` in the root path of the repository, and rebuild the kernel. + +3. `make clean` in the example scheduler directory and rebuild the schedulers. + +### Stale kernel build / incomplete vmlinux.h file + +As described above, you'll need a `vmlinux.h` file that was generated from a +vmlinux built with BTF, and with sched_ext support enabled. If you don't, +you'll see errors such as the following which indicate that a type being +referenced in a scheduler is unknown: + +``` +/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' + +const struct scx_exit_info *ei) + +^ +``` + +In order to resolve this, please follow the steps above in +[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your +schedulers are using a vmlinux.h file that includes the requisite types. + +## Misc + +### llvm: [OFF] + +You may see the following output when building the schedulers: + +``` +Auto-detecting system features: +... clang-bpf-co-re: [ on ] +... llvm: [ OFF ] +... libcap: [ on ] +... libbfd: [ on ] +``` + +Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. From e77257c4350c23348cae753f11f979535d836b2e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 30 Oct 2023 17:00:06 -0500 Subject: [PATCH 122/304] scx: Add missing build/ entry to .gitignore We're missing an entry in .gitignore for the build-generated files when building the example schedulers. --- tools/sched_ext/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index c63ee5e4f4bb0..215ed36b2a94a 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -7,3 +7,4 @@ scx_userland *.skel.h *.subskel.h /tools/ +build/ From 3f4b8859d64efa01dbed0cec0e830e6f6dbe7028 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 30 Oct 2023 17:26:11 -0500 Subject: [PATCH 123/304] scx: clean sched_ext example schedulers on root mrproper target We've gotten some feedback that it's confusing and/or inconvenient to know what needs to be clean built in order to be able to correctly compile and run the example schedulers. Let's update the build targets to make this simpler by: 1. Always cleaning sched_ext schedulers on make mrproper in the tree root 2. Adding a make fullclean target to the sched_ext tools directory which also invokes the root make clean target. 
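As a rough example of the intended flow (a sketch; run from the root of the tree):

$ make -C tools/sched_ext fullclean    # clean the schedulers and invoke the root-level make clean
$ make                                 # rebuild the kernel
$ make -C tools/sched_ext              # rebuild the example schedulers

With this change, a plain make mrproper at the tree root also cleans the sched_ext schedulers.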
Signed-off-by: David Vernet --- Makefile | 8 +++++++- tools/Makefile | 10 +++++++++- tools/sched_ext/Makefile | 9 ++++++++- tools/sched_ext/README.md | 8 ++++++-- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 57698d048e2ca..49152788ae59f 100644 --- a/Makefile +++ b/Makefile @@ -1341,6 +1341,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),) $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean endif +tools-clean-targets := sched_ext +PHONY += $(tools-clean-targets) +$(tools-clean-targets): + $(Q)$(MAKE) -sC tools $@_clean +tools_clean: $(tools-clean-targets) + # Clear a bunch of variables before executing the submake ifeq ($(quiet),silent_) tools_silent=s @@ -1510,7 +1516,7 @@ PHONY += $(mrproper-dirs) mrproper $(mrproper-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) -mrproper: clean $(mrproper-dirs) +mrproper: clean $(mrproper-dirs) tools_clean $(call cmd,rmfiles) @find . $(RCS_FIND_IGNORE) \ \( -name '*.rmeta' \) \ diff --git a/tools/Makefile b/tools/Makefile index 37e9f68048326..8021267f7e5b6 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -29,6 +29,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' + @echo ' sched_ext - sched_ext example schedulers' @echo ' bootconfig - boot config tool' @echo ' spi - spi tools' @echo ' tmon - thermal monitoring and tuning tool' @@ -92,6 +93,9 @@ perf: FORCE $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= +sched_ext: FORCE + $(call descend,sched_ext) + selftests: FORCE $(call descend,testing/$@) @@ -185,6 +189,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean +sched_ext_clean: + $(call descend,sched_ext,clean) + selftests_clean: $(call descend,testing/$(@:_clean=),clean) @@ -214,6 +221,7 @@ clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_cl mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ + sched_ext_clean .PHONY: FORCE diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index f1d718c2ff1ce..107aa2613a751 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -228,6 +228,9 @@ clean: rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland +fullclean: clean + $(Q)$(MAKE) -sC ../../ clean + help: @echo 'Building targets:' @echo ' all - Compile all schedulers' @@ -282,8 +285,12 @@ help: @echo 'Cleaning targets:' @echo ' clean - Remove all generated files, including intermediate' @echo ' rust files for rust schedulers.' + @echo '' + @echo ' fullclean - Remove all generated files, including intermediate' + @echo ' rust files for rust schedulers, and also trigger a' + @echo ' clean of the kernel at the root of the whole repository.' 
-.PHONY: all scx_rusty clean help +.PHONY: all scx_rusty clean fullclean help # delete failed targets .DELETE_ON_ERROR: diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index c5031d073a491..e030b1931ac26 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -358,9 +358,13 @@ clang than what's supported (i.e. older than 16.0.0). To remediate this: 1. `which clang` to make sure you're using a sufficiently new version of clang. -2. `make mrproper` in the root path of the repository, and rebuild the kernel. +2. `make fullclean` in the root path of the repository, and rebuild the kernel + and schedulers. -3. `make clean` in the example scheduler directory and rebuild the schedulers. +3. Rebuild the kernel, and then your example schedulers. + +The schedulers are also cleaned if you invoke `make mrproper` in the root +directory of the tree. ### Stale kernel build / incomplete vmlinux.h file From 2a5eb9843c1f1733890e11cc6e462e9cc4bbd6e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 31 Oct 2023 12:17:35 -1000 Subject: [PATCH 124/304] sched_ext: Add scx_layered --- tools/sched_ext/Makefile | 22 +- tools/sched_ext/scx_common.bpf.h | 8 +- tools/sched_ext/scx_layered/.gitignore | 3 + tools/sched_ext/scx_layered/Cargo.toml | 30 + tools/sched_ext/scx_layered/build.rs | 77 + tools/sched_ext/scx_layered/rustfmt.toml | 8 + .../scx_layered/src/bpf/layered.bpf.c | 947 ++++++++++ tools/sched_ext/scx_layered/src/bpf/layered.h | 96 + .../sched_ext/scx_layered/src/bpf/ravg.bpf.c | 329 ++++ .../sched_ext/scx_layered/src/bpf/util.bpf.c | 68 + .../sched_ext/scx_layered/src/layered_sys.rs | 10 + tools/sched_ext/scx_layered/src/main.rs | 1635 +++++++++++++++++ 12 files changed, 3225 insertions(+), 8 deletions(-) create mode 100644 tools/sched_ext/scx_layered/.gitignore create mode 100644 tools/sched_ext/scx_layered/Cargo.toml create mode 100644 tools/sched_ext/scx_layered/build.rs create mode 100644 tools/sched_ext/scx_layered/rustfmt.toml create mode 100644 tools/sched_ext/scx_layered/src/bpf/layered.bpf.c create mode 100644 tools/sched_ext/scx_layered/src/bpf/layered.h create mode 100644 tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c create mode 100644 tools/sched_ext/scx_layered/src/bpf/util.bpf.c create mode 100644 tools/sched_ext/scx_layered/src/layered_sys.rs create mode 100644 tools/sched_ext/scx_layered/src/main.rs diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 107aa2613a751..5c322b21f61fc 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -126,7 +126,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wall -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland scx_rusty +all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland scx_rusty scx_layered # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -218,12 +218,23 @@ scx_rusty: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) cargo build --manifest-path=$@/Cargo.toml $(CARGOFLAGS) $(Q)cp $(OUTPUT_DIR)/release/$@ $(BINDIR)/$@ +scx_layered_deps: + cargo fetch --manifest-path=scx_layered/Cargo.toml + +scx_layered: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) +scx_layered: export SCX_LAYERED_CLANG = $(CLANG) +scx_layered: export SCX_LAYERED_BPF_CFLAGS = $(BPF_CFLAGS) +scx_layered: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) + cargo build --manifest-path=$@/Cargo.toml $(CARGOFLAGS) + 
$(Q)cp $(OUTPUT_DIR)/release/$@ $(BINDIR)/$@ + install: all $(Q)mkdir -p $(DESTDIR)/usr/bin/ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/bin/ clean: cargo clean --manifest-path=scx_rusty/Cargo.toml + cargo clean --manifest-path=scx_layered/Cargo.toml rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland @@ -243,6 +254,7 @@ help: @echo ' scx_flatcg' @echo ' scx_userland' @echo ' scx_rusty' + @echo ' scx_layered' @echo '' @echo 'For any scheduler build target, you may specify an alternative' @echo 'build output path with the O= environment variable. For example:' @@ -254,8 +266,10 @@ help: @echo '' @echo '' @echo 'Rust schedulers:' - @echo ' scx_rusty - Build the scx_rusty load balancing scheduler.' - @echo ' scx_rusty_deps - Download the scx_rusty scheduler cargo dependencies.' + @echo ' scx_rusty - Build the scx_rusty scheduler.' + @echo ' scx_rusty_deps - Download the scx_rusty scheduler cargo dependencies.' + @echo ' scx_layered - Build the scx_layered scheduler.' + @echo ' scx_layered_deps - Download the scx_layered scheduler cargo dependencies.' @echo '' @echo 'For any cargo rust schedulers built with cargo, you can specify' @echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the' @@ -290,7 +304,7 @@ help: @echo ' rust files for rust schedulers, and also trigger a' @echo ' clean of the kernel at the root of the whole repository.' -.PHONY: all scx_rusty clean fullclean help +.PHONY: all scx_rusty scx_layered clean fullclean help # delete failed targets .DELETE_ON_ERROR: diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 81bfe3d041c9a..d6b42270b91cb 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -123,9 +123,9 @@ BPF_PROG(name, ##args) * return error; * *vptr = new_value; */ -#define MEMBER_VPTR(base, member) (typeof(base member) *)({ \ - u64 __base = (u64)base; \ - u64 __addr = (u64)&(base member) - __base; \ +#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ asm volatile ( \ "if %0 <= %[max] goto +2\n" \ "%0 = 0\n" \ @@ -133,7 +133,7 @@ BPF_PROG(name, ##args) "%0 += %1\n" \ : "+r"(__addr) \ : "r"(__base), \ - [max]"i"(sizeof(base) - sizeof(base member))); \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ __addr; \ }) diff --git a/tools/sched_ext/scx_layered/.gitignore b/tools/sched_ext/scx_layered/.gitignore new file mode 100644 index 0000000000000..186dba259ec21 --- /dev/null +++ b/tools/sched_ext/scx_layered/.gitignore @@ -0,0 +1,3 @@ +src/bpf/.output +Cargo.lock +target diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml new file mode 100644 index 0000000000000..6ba1b98d25cd9 --- /dev/null +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "scx_layered" +version = "0.0.1" +authors = ["Tejun Heo ", "Meta"] +edition = "2021" +description = "Userspace scheduling with BPF for Ads" +license = "GPL-2.0-only" + +[dependencies] +anyhow = "1.0" +bitvec = "1.0" +clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } +ctrlc = { version = "3.1", features = ["termination"] } +fb_procfs = "0.7" +lazy_static = "1.4" +libbpf-rs = "0.21" +libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } +libc = "0.2" +log = "0.4" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +simplelog = "0.12" + 
+[build-dependencies] +bindgen = { version = "0.61" } +libbpf-cargo = "0.21" +glob = "0.3" + +[features] +enable_backtrace = [] diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs new file mode 100644 index 0000000000000..744df9e1e301f --- /dev/null +++ b/tools/sched_ext/scx_layered/build.rs @@ -0,0 +1,77 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +extern crate bindgen; + +use std::env; +use std::fs::create_dir_all; +use std::path::Path; +use std::path::PathBuf; + +use glob::glob; +use libbpf_cargo::SkeletonBuilder; + +const HEADER_PATH: &str = "src/bpf/layered.h"; + +fn bindgen_layered() { + // Tell cargo to invalidate the built crate whenever the wrapper changes + println!("cargo:rerun-if-changed={}", HEADER_PATH); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header(HEADER_PATH) + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("layered_sys.rs")) + .expect("Couldn't write bindings!"); +} + +fn gen_bpf_sched(name: &str) { + let bpf_cflags = env::var("SCX_LAYERED_BPF_CFLAGS").unwrap(); + let clang = env::var("SCX_LAYERED_CLANG").unwrap(); + eprintln!("{}", clang); + let outpath = format!("./src/bpf/.output/{}.skel.rs", name); + let skel = Path::new(&outpath); + let src = format!("./src/bpf/{}.bpf.c", name); + let obj = format!("./src/bpf/.output/{}.bpf.o", name); + SkeletonBuilder::new() + .source(src.clone()) + .obj(obj) + .clang(clang) + .clang_args(bpf_cflags) + .build_and_generate(skel) + .unwrap(); + + // Trigger rebuild if any .[hc] files are changed in the directory. + for path in glob("./src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { + println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); + } +} + +fn main() { + bindgen_layered(); + // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. + // Reasons are because the generated skeleton contains compiler attributes + // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` + // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside + // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). + // + // However, there is hope! When the above feature stabilizes we can clean this + // all up. + create_dir_all("./src/bpf/.output").unwrap(); + gen_bpf_sched("layered"); +} diff --git a/tools/sched_ext/scx_layered/rustfmt.toml b/tools/sched_ext/scx_layered/rustfmt.toml new file mode 100644 index 0000000000000..b7258ed0a8d84 --- /dev/null +++ b/tools/sched_ext/scx_layered/rustfmt.toml @@ -0,0 +1,8 @@ +# Get help on options with `rustfmt --help=config` +# Please keep these in alphabetical order. 
+edition = "2021" +group_imports = "StdExternalCrate" +imports_granularity = "Item" +merge_derives = false +use_field_init_shorthand = true +version = "Two" diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c new file mode 100644 index 0000000000000..1ee597fdf86cb --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c @@ -0,0 +1,947 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ +#include "../../../scx_common.bpf.h" +#include "layered.h" + +#include +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +const volatile u32 debug = 0; +const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 nr_possible_cpus = 1; +const volatile u32 nr_layers = 1; +const volatile bool smt_enabled = true; +const volatile unsigned char all_cpus[MAX_CPUS_U8]; + +private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; +struct layer layers[MAX_LAYERS]; +u32 fallback_cpu; +u32 preempt_cursor; + +#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0) +#define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) + +#include "util.bpf.c" +#include "ravg.bpf.c" + +struct user_exit_info uei; + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, __u32); + __type(value, struct cpu_ctx); + __uint(max_entries, 1); +} cpu_ctxs SEC(".maps"); + +static struct cpu_ctx *lookup_cpu_ctx(int cpu) +{ + struct cpu_ctx *cctx; + u32 zero = 0; + + if (cpu < 0) + cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); + else + cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); + + if (!cctx) { + scx_bpf_error("no cpu_ctx for cpu %d", cpu); + return NULL; + } + + return cctx; +} + +static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx) +{ + if (idx < 0 || idx >= NR_GSTATS) { + scx_bpf_error("invalid global stat idx %d", idx); + return; + } + + cctx->gstats[idx]++; +} + +static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, struct cpu_ctx *cctx) +{ + u64 *vptr; + + if ((vptr = MEMBER_VPTR(*cctx, .lstats[layer->idx][idx]))) + (*vptr)++; + else + scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx); +} + +static struct layer_load { + u64 load; + struct ravg_data ravg_data; +} layer_loads[MAX_LAYERS]; + +private(layer_loads) struct bpf_spin_lock layer_loads_lock; + +const u64 ravg_1 = 1 << RAVG_FRAC_BITS; + +static void adj_load(u32 layer, s64 adj, u64 now) +{ + struct layer_load *load = &layer_loads[layer]; + + if (layer >= nr_layers) { + scx_bpf_error("invalid layer %u", layer); + return; + } + + bpf_spin_lock(&layer_loads_lock); + load->load += adj; + ravg_accumulate(&load->ravg_data, load->load, now, USAGE_HALF_LIFE); + bpf_spin_unlock(&layer_loads_lock); + + if (debug && adj < 0 && (s64)load->load < 0) + scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)", + bpf_get_smp_processor_id(), layer, load->load, adj); +} + +struct layer_cpumask_container { + struct bpf_cpumask __kptr *cpumask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct layer_cpumask_container); + __uint(max_entries, MAX_LAYERS); + __uint(map_flags, 0); +} layer_cpumasks SEC(".maps"); + +static struct cpumask *lookup_layer_cpumask(int idx) +{ + struct layer_cpumask_container *cont; + + if ((cont = bpf_map_lookup_elem(&layer_cpumasks, &idx))) { + return (struct cpumask *)cont->cpumask; + 
} else { + scx_bpf_error("no layer_cpumask"); + return NULL; + } +} + +static void refresh_cpumasks(int idx) +{ + struct layer_cpumask_container *cont; + struct layer *layer; + int cpu, total = 0; + + if (!__sync_val_compare_and_swap(&layers[idx].refresh_cpus, 1, 0)) + return; + + cont = bpf_map_lookup_elem(&layer_cpumasks, &idx); + + bpf_for(cpu, 0, nr_possible_cpus) { + u8 *u8_ptr; + + if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) { + barrier_var(cont); + if (!cont || !cont->cpumask) { + scx_bpf_error("can't happen"); + return; + } + if (*u8_ptr & (1 << (cpu % 8))) { + bpf_cpumask_set_cpu(cpu, cont->cpumask); + total++; + } else { + bpf_cpumask_clear_cpu(cpu, cont->cpumask); + } + } else { + scx_bpf_error("can't happen"); + } + } + + // XXX - shouldn't be necessary + layer = MEMBER_VPTR(layers, [idx]); + if (!layer) { + scx_bpf_error("can't happen"); + return; + } + + layer->nr_cpus = total; + __sync_fetch_and_add(&layer->cpus_seq, 1); + trace("LAYER[%d] now has %d cpus, seq=%llu", idx, layer->nr_cpus, layer->cpus_seq); +} + +SEC("fentry/scheduler_tick") +int scheduler_tick_fentry(const void *ctx) +{ + u64 now; + int idx; + + if (bpf_get_smp_processor_id() != 0) + return 0; + + now = bpf_ktime_get_ns(); + bpf_for(idx, 0, nr_layers) { + layers[idx].load_avg = ravg_read(&layer_loads[idx].ravg_data, + now, USAGE_HALF_LIFE); + refresh_cpumasks(idx); + } + return 0; +} + +struct task_ctx { + int pid; + + int layer; + bool refresh_layer; + u64 layer_cpus_seq; + struct bpf_cpumask __kptr *layered_cpumask; + + bool all_cpus_allowed; + bool dispatch_local; + u64 started_running_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, struct task_ctx); + __uint(max_entries, MAX_TASKS); + __uint(map_flags, 0); +} task_ctxs SEC(".maps"); + +struct task_ctx *lookup_task_ctx_may_fail(struct task_struct *p) +{ + s32 pid = p->pid; + + return bpf_map_lookup_elem(&task_ctxs, &pid); +} + +struct task_ctx *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx *tctx; + s32 pid = p->pid; + + if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid))) { + return tctx; + } else { + scx_bpf_error("task_ctx lookup failed"); + return NULL; + } +} + +struct layer *lookup_layer(int idx) +{ + if (idx < 0 || idx >= nr_layers) { + scx_bpf_error("invalid layer %d", idx); + return NULL; + } + return &layers[idx]; +} + +SEC("tp_btf/cgroup_attach_task") +int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, + struct task_struct *leader, bool threadgroup) +{ + struct task_struct *next; + struct task_ctx *tctx; + int leader_pid = leader->pid; + + if (!(tctx = lookup_task_ctx_may_fail(leader))) + return 0; + tctx->refresh_layer = true; + + if (!threadgroup) + return 0; + + if (!(next = bpf_task_acquire(leader))) { + scx_bpf_error("failed to acquire leader"); + return 0; + } + + bpf_repeat(MAX_TASKS) { + struct task_struct *p; + int pid; + + p = container_of(next->thread_group.next, struct task_struct, thread_group); + bpf_task_release(next); + + pid = BPF_CORE_READ(p, pid); + if (pid == leader_pid) { + next = NULL; + break; + } + + next = bpf_task_from_pid(pid); + if (!next) { + scx_bpf_error("thread iteration failed"); + break; + } + + if ((tctx = lookup_task_ctx(next))) + tctx->refresh_layer = true; + } + + if (next) + bpf_task_release(next); + return 0; +} + +SEC("fentry/__set_task_comm") +int BPF_PROG(fentry_set_task_comm, struct task_struct *p, const char *buf, bool exec) +{ + struct task_ctx *tctx; + + if ((tctx = lookup_task_ctx_may_fail(p))) + 
tctx->refresh_layer = true; + return 0; +} + +static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask, + struct task_struct *p, struct task_ctx *tctx, + const struct cpumask *layer_cpumask) +{ + u64 layer_seq = layers->cpus_seq; + + if (tctx->layer_cpus_seq == layer_seq) + return; + + bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr); + tctx->layer_cpus_seq = layer_seq; + trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq); +} + +static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu, + const struct cpumask *idle_cpumask, + const struct cpumask *idle_smtmask) +{ + bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask); + s32 cpu; + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (smt_enabled) { + if (prev_in_cand && + bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + return cpu; + } + + if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + return scx_bpf_pick_idle_cpu(cand_cpumask, 0); +} + +s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + const struct cpumask *idle_cpumask, *idle_smtmask; + struct cpumask *layer_cpumask, *layered_cpumask; + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct layer *layer; + s32 cpu; + + /* look up everything we need */ + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || + !(layered_cpumask = (struct cpumask *)tctx->layered_cpumask)) + return prev_cpu; + + /* + * We usually update the layer in layered_runnable() to avoid confusing. + * As layered_select_cpu() takes place before runnable, new tasks would + * still have -1 layer. Just return @prev_cpu. + */ + if (tctx->layer < 0) + return prev_cpu; + + if (!(layer = lookup_layer(tctx->layer)) || + !(layer_cpumask = lookup_layer_cpumask(tctx->layer))) + return prev_cpu; + + if (!(idle_cpumask = scx_bpf_get_idle_cpumask())) + return prev_cpu; + + if (!(idle_smtmask = scx_bpf_get_idle_smtmask())) { + cpu = prev_cpu; + goto out_put_idle_cpumask; + } + + /* not much to do if bound to a single CPU */ + if (p->nr_cpus_allowed == 1) { + cpu = prev_cpu; + if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + if (!bpf_cpumask_test_cpu(cpu, layer_cpumask)) + lstat_inc(LSTAT_AFFN_VIOL, layer, cctx); + goto dispatch_local; + } else { + goto out_put_cpumasks; + } + } + + maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask); + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu, + idle_cpumask, idle_smtmask)) >= 0) + goto dispatch_local; + + /* + * If the layer is an open one, we can try the whole machine. 
+ */ + if (layer->open && + ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu, + idle_cpumask, idle_smtmask)) >= 0)) { + lstat_inc(LSTAT_OPEN_IDLE, layer, cctx); + goto dispatch_local; + } + + cpu = prev_cpu; + goto out_put_cpumasks; + +dispatch_local: + tctx->dispatch_local = true; +out_put_cpumasks: + scx_bpf_put_idle_cpumask(idle_smtmask); +out_put_idle_cpumask: + scx_bpf_put_idle_cpumask(idle_cpumask); + return cpu; +} + +void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct layer *layer; + u64 vtime = p->scx.dsq_vtime; + u32 idx; + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || + !(layer = lookup_layer(tctx->layer))) + return; + + if (tctx->dispatch_local) { + tctx->dispatch_local = false; + lstat_inc(LSTAT_LOCAL, layer, cctx); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + lstat_inc(LSTAT_GLOBAL, layer, cctx); + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, layer->vtime_now - slice_ns)) + vtime = layer->vtime_now - slice_ns; + + if (!tctx->all_cpus_allowed) { + lstat_inc(LSTAT_AFFN_VIOL, layer, cctx); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); + return; + } + + scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags); + + if (!layer->preempt) + return; + + bpf_for(idx, 0, nr_possible_cpus) { + struct cpu_ctx *cand_cctx; + u32 cpu = (preempt_cursor + idx) % nr_possible_cpus; + + if (!all_cpumask || + !bpf_cpumask_test_cpu(cpu, (const struct cpumask *)all_cpumask)) + continue; + if (!(cand_cctx = lookup_cpu_ctx(cpu)) || cand_cctx->current_preempt) + continue; + + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + preempt_cursor = (cpu + 1) % nr_possible_cpus; + lstat_inc(LSTAT_PREEMPT, layer, cctx); + break; + } +} + +void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev) +{ + int idx; + + /* consume preempting layers first */ + bpf_for(idx, 0, nr_layers) + if (layers[idx].preempt && scx_bpf_consume(idx)) + return; + + /* consume !open layers second */ + bpf_for(idx, 0, nr_layers) { + struct layer *layer = &layers[idx]; + struct cpumask *layer_cpumask; + + if (layer->open) + continue; + + /* consume matching layers */ + if (!(layer_cpumask = lookup_layer_cpumask(idx))) + return; + + if (bpf_cpumask_test_cpu(cpu, layer_cpumask)) { + if (scx_bpf_consume(idx)) + return; + } else if (cpu == fallback_cpu && layer->nr_cpus == 0) { + if (scx_bpf_consume(idx)) + return; + } + } + + /* consume !preempting open layers */ + bpf_for(idx, 0, nr_layers) { + if (!layers[idx].preempt && layers[idx].open && + scx_bpf_consume(idx)) + return; + } +} + +static bool match_one(struct layer_match *match, struct task_struct *p, const char *cgrp_path) +{ + switch (match->kind) { + case MATCH_CGROUP_PREFIX: { + return match_prefix(match->cgroup_prefix, cgrp_path, MAX_PATH); + } + case MATCH_COMM_PREFIX: { + char comm[MAX_COMM]; + memcpy(comm, p->comm, MAX_COMM); + return match_prefix(match->comm_prefix, comm, MAX_COMM); + } + case MATCH_NICE_ABOVE: + return (s32)p->static_prio - 120 > match->nice_above_or_below; + case MATCH_NICE_BELOW: + return (s32)p->static_prio - 120 < match->nice_above_or_below; + default: + scx_bpf_error("invalid match kind %d", match->kind); + return false; + } +} + +static bool match_layer(struct layer *layer, struct task_struct *p, const char *cgrp_path) +{ + u32 nr_match_ors = layer->nr_match_ors; + u64 or_idx, and_idx; + + if (nr_match_ors > 
MAX_LAYER_MATCH_ORS) { + scx_bpf_error("too many ORs"); + return false; + } + + bpf_for(or_idx, 0, nr_match_ors) { + struct layer_match_ands *ands; + bool matched = true; + + barrier_var(or_idx); + if (or_idx >= MAX_LAYER_MATCH_ORS) + return false; /* can't happen */ + ands = &layer->matches[or_idx]; + + if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { + scx_bpf_error("too many ANDs"); + return false; + } + + bpf_for(and_idx, 0, ands->nr_match_ands) { + struct layer_match *match; + + barrier_var(and_idx); + if (and_idx >= NR_LAYER_MATCH_KINDS) + return false; /* can't happen */ + match = &ands->matches[and_idx]; + + if (!match_one(match, p, cgrp_path)) { + matched = false; + break; + } + } + + if (matched) + return true; + } + + return false; +} + +static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx) +{ + const char *cgrp_path; + bool matched = false; + u64 idx; // XXX - int makes verifier unhappy + + if (!tctx->refresh_layer) + return; + tctx->refresh_layer = false; + + if (!(cgrp_path = format_cgrp_path(p->cgroups->dfl_cgrp))) + return; + + if (tctx->layer >= 0 && tctx->layer < nr_layers) + __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); + + bpf_for(idx, 0, nr_layers) { + if (match_layer(&layers[idx], p, cgrp_path)) { + matched = true; + break; + } + } + + if (matched) { + struct layer *layer = &layers[idx]; + + tctx->layer = idx; + tctx->layer_cpus_seq = layer->cpus_seq - 1; + __sync_fetch_and_add(&layer->nr_tasks, 1); + /* + * XXX - To be correct, we'd need to calculate the vtime + * delta in the previous layer, scale it by the load + * fraction difference and then offset from the new + * layer's vtime_now. For now, just do the simple thing + * and assume the offset to be zero. + * + * Revisit if high frequency dynamic layer switching + * needs to be supported. 
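+		 *
+		 * In practice this means a task switching layers starts at the
+		 * new layer's current vtime_now, with any vtime credit or debt
+		 * accumulated in the previous layer discarded.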
+ */ + p->scx.dsq_vtime = layer->vtime_now; + } else { + scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid); + } + + if (tctx->layer < nr_layers - 1) + trace("LAYER=%d %s[%d] cgrp=\"%s\"", + tctx->layer, p->comm, p->pid, cgrp_path); +} + +void BPF_STRUCT_OPS(layered_runnable, struct task_struct *p, u64 enq_flags) +{ + u64 now = bpf_ktime_get_ns(); + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + maybe_refresh_layer(p, tctx); + + adj_load(tctx->layer, p->scx.weight, now); +} + +void BPF_STRUCT_OPS(layered_running, struct task_struct *p) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + struct layer *layer; + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || + !(layer = lookup_layer(tctx->layer))) + return; + + if (vtime_before(layer->vtime_now, p->scx.dsq_vtime)) + layer->vtime_now = p->scx.dsq_vtime; + + cctx->current_preempt = layer->preempt; + tctx->started_running_at = bpf_ktime_get_ns(); +} + +void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + u64 used; + u32 layer; + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) + return; + + layer = tctx->layer; + if (layer >= nr_layers) { + scx_bpf_error("invalid layer %u", layer); + return; + } + + used = bpf_ktime_get_ns() - tctx->started_running_at; + cctx->layer_cycles[layer] += used; + cctx->current_preempt = false; + + /* scale the execution time by the inverse of the weight and charge */ + p->scx.dsq_vtime += used * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(layered_quiescent, struct task_struct *p, u64 deq_flags) +{ + struct task_ctx *tctx; + + if ((tctx = lookup_task_ctx(p))) + adj_load(tctx->layer, -(s64)p->scx.weight, bpf_ktime_get_ns()); +} + +void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight) +{ + struct task_ctx *tctx; + + if ((tctx = lookup_task_ctx(p))) + tctx->refresh_layer = true; +} + +void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + struct task_ctx *tctx; + pid_t pid = p->pid; + + if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid)) && all_cpumask) + tctx->all_cpus_allowed = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); + else + scx_bpf_error("missing task_ctx or all_cpumask"); +} + +s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + struct task_ctx tctx_init = { + .pid = p->pid, + .layer = -1, + .refresh_layer = true, + }; + struct task_ctx *tctx; + struct bpf_cpumask *cpumask; + s32 pid = p->pid; + s32 ret; + + if (all_cpumask) + tctx_init.all_cpus_allowed = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, p->cpus_ptr); + else + scx_bpf_error("missing all_cpumask"); + + /* + * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may + * fail spuriously due to BPF recursion protection triggering + * unnecessarily. + */ + if ((ret = bpf_map_update_elem(&task_ctxs, &pid, &tctx_init, 0 /*BPF_NOEXIST*/))) { + scx_bpf_error("task_ctx allocation failure, ret=%d", ret); + return ret; + } + + /* + * Read the entry from the map immediately so we can add the cpumask + * with bpf_kptr_xchg(). + */ + if (!(tctx = lookup_task_ctx(p))) + return -ENOENT; + + cpumask = bpf_cpumask_create(); + if (!cpumask) { + bpf_map_delete_elem(&task_ctxs, &pid); + return -ENOMEM; + } + + cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask); + if (cpumask) { + /* Should never happen as we just inserted it above. 
*/ + bpf_cpumask_release(cpumask); + bpf_map_delete_elem(&task_ctxs, &pid); + return -EINVAL; + } + + /* + * We are matching cgroup hierarchy path directly rather than the CPU + * controller path. As the former isn't available during the scheduler + * fork path, let's delay the layer selection until the first + * runnable(). + */ + + return 0; +} + +void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p) +{ + s32 pid = p->pid; + + bpf_map_delete_elem(&task_ctxs, &pid); +} + +void BPF_STRUCT_OPS(layered_disable, struct task_struct *p) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + s32 pid = p->pid; + int ret; + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) + return; + + if (tctx->layer >= 0 && tctx->layer < nr_layers) + __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); + + /* + * XXX - There's no reason delete should fail here but BPF's recursion + * protection can unnecessarily fail the operation. The fact that + * deletions aren't reliable means that we sometimes leak task_ctx and + * can't use BPF_NOEXIST on allocation in .prep_enable(). + */ + ret = bpf_map_delete_elem(&task_ctxs, &pid); + if (ret) + gstat_inc(GSTAT_TASK_CTX_FREE_FAILED, cctx); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) +{ + struct bpf_cpumask *cpumask; + int i, j, k, nr_online_cpus, ret; + + scx_bpf_switch_all(); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + nr_online_cpus = 0; + bpf_for(i, 0, nr_possible_cpus) { + const volatile u8 *u8_ptr; + + if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { + if (*u8_ptr & (1 << (i % 8))) { + bpf_cpumask_set_cpu(i, cpumask); + nr_online_cpus++; + } + } else { + return -EINVAL; + } + } + + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + dbg("CFG: Dumping configuration, nr_online_cpus=%d smt_enabled=%d", + nr_online_cpus, smt_enabled); + + bpf_for(i, 0, nr_layers) { + struct layer *layer = &layers[i]; + + dbg("CFG LAYER[%d] open=%d preempt=%d", + i, layer->open, layer->preempt); + + if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) { + scx_bpf_error("too many ORs"); + return -EINVAL; + } + + bpf_for(j, 0, layer->nr_match_ors) { + struct layer_match_ands *ands = MEMBER_VPTR(layers, [i].matches[j]); + if (!ands) { + scx_bpf_error("shouldn't happen"); + return -EINVAL; + } + + if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { + scx_bpf_error("too many ANDs"); + return -EINVAL; + } + + dbg("CFG OR[%02d]", j); + + bpf_for(k, 0, ands->nr_match_ands) { + char header[32]; + u64 header_data[1] = { k }; + struct layer_match *match; + + bpf_snprintf(header, sizeof(header), "CFG AND[%02d]:", + header_data, sizeof(header_data)); + + match = MEMBER_VPTR(layers, [i].matches[j].matches[k]); + if (!match) { + scx_bpf_error("shouldn't happen"); + return -EINVAL; + } + + switch (match->kind) { + case MATCH_CGROUP_PREFIX: + dbg("%s CGROUP_PREFIX \"%s\"", header, match->cgroup_prefix); + break; + case MATCH_COMM_PREFIX: + dbg("%s COMM_PREFIX \"%s\"", header, match->comm_prefix); + break; + case MATCH_NICE_ABOVE: + dbg("%s NICE_ABOVE %d", header, match->nice_above_or_below); + break; + case MATCH_NICE_BELOW: + dbg("%s NICE_BELOW %d", header, match->nice_above_or_below); + break; + default: + scx_bpf_error("%s Invalid kind", header); + return -EINVAL; + } + } + if (ands->nr_match_ands == 0) + dbg("CFG DEFAULT"); + } + } + + bpf_for(i, 0, nr_layers) { + struct layer_cpumask_container *cont; + + layers[i].idx = i; + + ret = scx_bpf_create_dsq(i, -1); + if (ret < 0) + return ret; + + 
if (!(cont = bpf_map_lookup_elem(&layer_cpumasks, &i))) + return -ENONET; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&cont->cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + } + + return 0; +} + +void BPF_STRUCT_OPS(layered_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops layered = { + .select_cpu = (void *)layered_select_cpu, + .enqueue = (void *)layered_enqueue, + .dispatch = (void *)layered_dispatch, + .runnable = (void *)layered_runnable, + .running = (void *)layered_running, + .stopping = (void *)layered_stopping, + .quiescent = (void *)layered_quiescent, + .set_weight = (void *)layered_set_weight, + .set_cpumask = (void *)layered_set_cpumask, + .prep_enable = (void *)layered_prep_enable, + .cancel_enable = (void *)layered_cancel_enable, + .disable = (void *)layered_disable, + .init = (void *)layered_init, + .exit = (void *)layered_exit, + .name = "layered", +}; diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.h b/tools/sched_ext/scx_layered/src/bpf/layered.h new file mode 100644 index 0000000000000..3191326763b84 --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf/layered.h @@ -0,0 +1,96 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#ifndef __LAYERED_H +#define __LAYERED_H + +#include +#ifndef __kptr +#ifdef __KERNEL__ +#error "__kptr_ref not defined in the kernel" +#endif +#define __kptr +#endif + +#ifndef __KERNEL__ +typedef unsigned long long __u64; +typedef long long __s64; +#endif + +enum consts { + MAX_CPUS_SHIFT = 9, + MAX_CPUS = 1 << MAX_CPUS_SHIFT, + MAX_CPUS_U8 = MAX_CPUS / 8, + MAX_TASKS = 131072, + MAX_PATH = 4096, + MAX_COMM = 16, + MAX_LAYER_MATCH_ORS = 32, + MAX_LAYERS = 16, + USAGE_HALF_LIFE = 1 * 100000000, /* 100ms */ + + /* XXX remove */ + MAX_CGRP_PREFIXES = 32 +}; + +/* Statistics */ +enum global_stat_idx { + GSTAT_TASK_CTX_FREE_FAILED, + NR_GSTATS, +}; + +enum layer_stat_idx { + LSTAT_LOCAL, + LSTAT_GLOBAL, + LSTAT_OPEN_IDLE, + LSTAT_AFFN_VIOL, + LSTAT_PREEMPT, + NR_LSTATS, +}; + +struct cpu_ctx { + bool current_preempt; + __u64 layer_cycles[MAX_LAYERS]; + __u64 gstats[NR_GSTATS]; + __u64 lstats[MAX_LAYERS][NR_LSTATS]; +}; + +enum layer_match_kind { + MATCH_CGROUP_PREFIX, + MATCH_COMM_PREFIX, + MATCH_NICE_ABOVE, + MATCH_NICE_BELOW, + + NR_LAYER_MATCH_KINDS, +}; + +struct layer_match { + int kind; + char cgroup_prefix[MAX_PATH]; + char comm_prefix[MAX_COMM]; + int nice_above_or_below; +}; + +struct layer_match_ands { + struct layer_match matches[NR_LAYER_MATCH_KINDS]; + int nr_match_ands; +}; + +struct layer { + struct layer_match_ands matches[MAX_LAYER_MATCH_ORS]; + unsigned int nr_match_ors; + unsigned int idx; + bool open; + bool preempt; + + __u64 vtime_now; + __u64 nr_tasks; + __u64 load_avg; + + __u64 cpus_seq; + unsigned int refresh_cpus; + unsigned char cpus[MAX_CPUS_U8]; + unsigned int nr_cpus; // managed from BPF side +}; + +#endif /* __LAYERED_H */ diff --git a/tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c b/tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c new file mode 100644 index 0000000000000..91637624fd59b --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c @@ -0,0 +1,329 @@ +/* to be included in the main bpf.c file */ + +#define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) +//#define RAVG_FN_ATTRS __attribute__((unused)) + +/* + * Running 
average helpers to be used in BPF progs. Assumes vmlinux.h has + * already been included. + */ +enum ravg_consts { + RAVG_VAL_BITS = 44, /* input values are 44bit */ + RAVG_FRAC_BITS = 20, /* 1048576 is 1.0 */ +}; + +/* + * Running avg mechanism. Accumulates values between 0 and RAVG_MAX_VAL in + * arbitrary time intervals. The accumulated values are halved every half_life + * with each period starting when the current time % half_life is 0. Zeroing is + * enough for initialization. + * + * See ravg_accumulate() and ravg_read() for more details. + */ +struct ravg_data { + /* current value */ + __u64 val; + + /* + * The timestamp of @val. The latest completed seq #: + * + * (val_at / half_life) - 1 + */ + __u64 val_at; + + /* running avg as of the latest completed seq */ + __u64 old; + + /* + * Accumulated value of the current period. Input value is 48bits and we + * normalize half-life to 16bit, so it should fit in an u64. + */ + __u64 cur; +}; + +static RAVG_FN_ATTRS void ravg_add(__u64 *sum, __u64 addend) +{ + __u64 new = *sum + addend; + + if (new >= *sum) + *sum = new; + else + *sum = -1; +} + +static RAVG_FN_ATTRS __u64 ravg_decay(__u64 v, __u32 shift) +{ + if (shift >= 64) + return 0; + else + return v >> shift; +} + +static RAVG_FN_ATTRS __u32 ravg_normalize_dur(__u32 dur, __u32 half_life) +{ + if (dur < half_life) + return (((__u64)dur << RAVG_FRAC_BITS) + half_life - 1) / + half_life; + else + return 1 << RAVG_FRAC_BITS; +} + +/* + * Pre-computed decayed full-period values. This is quicker and keeps the bpf + * verifier happy by removing the need for looping. + * + * [0] = ravg_decay(1 << RAVG_FRAC_BITS, 1) + * [1] = [0] + ravg_decay(1 << RAVG_FRAC_BITS, 2) + * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) + * ... + */ +static __u64 ravg_full_sum[] = { + 524288, 786432, 917504, 983040, + 1015808, 1032192, 1040384, 1044480, + 1046528, 1047552, 1048064, 1048320, + 1048448, 1048512, 1048544, 1048560, + 1048568, 1048572, 1048574, 1048575, + /* the same from here on */ +}; + +static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_sum[0]); + +/** + * ravg_accumulate - Accumulate a new value + * @rd: ravg_data to accumulate into + * @new_val: new value + * @now: current timestamp + * @half_life: decay period, must be the same across calls + * + * The current value is changing to @val at @now. Accumulate accordingly. + */ +static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, + __u64 new_val, __u64 now, + __u32 half_life) +{ + __u32 cur_seq, val_seq, seq_delta; + + /* + * It may be difficult for the caller to guarantee monotonic progress if + * multiple CPUs accumulate to the same ravg_data. Handle @now being in + * the past of @rd->val_at. + */ + if (now < rd->val_at) + now = rd->val_at; + + cur_seq = now / half_life; + val_seq = rd->val_at / half_life; + seq_delta = cur_seq - val_seq; + + /* + * Decay ->old and fold ->cur into it. + * + * @end + * v + * timeline |---------|---------|---------|---------|---------| + * seq delta 4 3 2 1 0 + * seq ->seq cur_seq + * val ->old ->cur ^ + * | | | + * \---------+------------------/ + */ + if (seq_delta > 0) { + /* decay ->old to bring it upto the cur_seq - 1 */ + rd->old = ravg_decay(rd->old, seq_delta); + /* non-zero ->cur must be from val_seq, calc and fold */ + ravg_add(&rd->old, ravg_decay(rd->cur, seq_delta)); + /* clear */ + rd->cur = 0; + } + + if (!rd->val) + goto out; + + /* + * Accumulate @rd->val between @rd->val_at and @now. 
+ * + * @rd->val_at @now + * v v + * timeline |---------|---------|---------|---------|---------| + * seq delta [ 3 | 2 | 1 | 0 ] + */ + if (seq_delta > 0) { + __u32 dur; + + /* fold the oldest period which may be partial */ + dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); + ravg_add(&rd->old, rd->val * ravg_decay(dur, seq_delta)); + + /* fold the full periods in the middle with precomputed vals */ + if (seq_delta > 1) { + __u32 idx = seq_delta - 2; + + if (idx < ravg_full_sum_len) + ravg_add(&rd->old, rd->val * + ravg_full_sum[idx]); + else + ravg_add(&rd->old, rd->val * + ravg_full_sum[ravg_full_sum_len - 2]); + } + + /* accumulate the current period duration into ->runtime */ + rd->cur += rd->val * ravg_normalize_dur(now % half_life, + half_life); + } else { + rd->cur += rd->val * ravg_normalize_dur(now - rd->val_at, + half_life); + } +out: + if (new_val >= 1LLU << RAVG_VAL_BITS) + rd->val = (1LLU << RAVG_VAL_BITS) - 1; + else + rd->val = new_val; + rd->val_at = now; +} + +/** + * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) + * @a: multiplicand + * @b: multiplier + * @rshift: number of bits to shift right + * + * Poor man's 128bit arithmetic. Calculate ((@a * @b) >> @rshift) where @a is + * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must + * ensure that the final shifted result fits in u64. + */ +static __u64 u64_x_u32_rshift(__u64 a, __u32 b, __u32 rshift) +{ + const __u64 mask32 = (__u32)-1; + __u64 al = a & mask32; + __u64 ah = (a & (mask32 << 32)) >> 32; + + /* + * ah: high 32 al: low 32 + * a |--------------||--------------| + * + * ah * b |--------------||--------------| + * al * b |--------------||--------------| + */ + al *= b; + ah *= b; + + /* + * (ah * b) >> rshift |--------------||--------------| + * (al * b) >> rshift |--------------||--------| + * <--------> + * 32 - rshift + */ + al >>= rshift; + if (rshift <= 32) + ah <<= 32 - rshift; + else + ah >>= rshift - 32; + + return al + ah; +} + +/** + * ravg_read - Read the current running avg + * @rd: ravg_data to read from + * @now: timestamp as of which to read the running avg + * @half_life: decay period, must match ravg_accumulate()'s + * + * Read running avg from @rd as of @now. + */ +static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, + __u64 half_life) +{ + struct ravg_data trd; + __u32 elapsed = now % half_life; + + /* + * Accumulate the ongoing period into a temporary copy. This allows + * external readers to access up-to-date avg without strongly + * synchronizing with the updater (we need to add a seq lock tho). + */ + trd = *rd; + rd = &trd; + ravg_accumulate(rd, 0, now, half_life); + + /* + * At the beginning of a new half_life period, the running avg is the + * same as @rd->old. At the beginning of the next, it'd be old load / 2 + * + current load / 2. Inbetween, we blend the two linearly. + */ + if (elapsed) { + __u32 progress = ravg_normalize_dur(elapsed, half_life); + /* + * `H` is the duration of the half-life window, and `E` is how + * much time has elapsed in this window. 
`P` is [0.0, 1.0] + * representing how much the current window has progressed: + * + * P = E / H + * + * If `old` is @rd->old, we would want to calculate the + * following for blending: + * + * old * (1.0 - P / 2) + * + * Because @progress is [0, 1 << RAVG_FRAC_BITS], let's multiply + * and then divide by 1 << RAVG_FRAC_BITS: + * + * (1 << RAVG_FRAC_BITS) - (1 << RAVG_FRAC_BITS) * P / 2 + * old * ----------------------------------------------------- + * 1 << RAVG_FRAC_BITS + * + * As @progress is (1 << RAVG_FRAC_BITS) * P: + * + * (1 << RAVG_FRAC_BITS) - progress / 2 + * old * ------------------------------------ + * 1 << RAVG_FRAC_BITS + * + * As @rd->old uses full 64bit, the multiplication can overflow, + * but we also know that the final result is gonna be smaller + * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle + * the interim multiplication correctly. + */ + __u64 old = u64_x_u32_rshift(rd->old, + (1 << RAVG_FRAC_BITS) - progress / 2, + RAVG_FRAC_BITS); + /* + * If `S` is the Sum(val * duration) for this half-life window, + * the avg for this window is: + * + * S / E + * + * We would want to calculate the following for blending: + * + * S / E * (P / 2) + * + * As P = E / H, + * + * S / E * (E / H / 2) + * S / H / 2 + * + * Expanding S, the above becomes: + * + * Sum(val * duration) / H / 2 + * Sum(val * (duration / H)) / 2 + * + * As we use RAVG_FRAC_BITS bits for fixed point arithmetic, + * let's multiply the whole result accordingly: + * + * (Sum(val * (duration / H)) / 2) * (1 << RAVG_FRAC_BITS) + * + * duration * (1 << RAVG_FRAC_BITS) + * Sum(val * --------------------------------) / 2 + * H + * + * The righthand multiplier inside Sum() is the normalized + * duration returned from ravg_normalize_dur(), so, the whole + * Sum term equals @rd->cur. 
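+		 * The blended contribution of the current window therefore
+		 * simplifies to: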
+ * + * rd->cur / 2 + */ + __u64 cur = rd->cur / 2; + + return old + cur; + } else { + return rd->old; + } +} diff --git a/tools/sched_ext/scx_layered/src/bpf/util.bpf.c b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c new file mode 100644 index 0000000000000..703e0eece60b2 --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c @@ -0,0 +1,68 @@ +/* to be included in the main bpf.c file */ + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + /* double size because verifier can't follow length calculation */ + __uint(value_size, 2 * MAX_PATH); + __uint(max_entries, 1); +} cgrp_path_bufs SEC(".maps"); + +static char *format_cgrp_path(struct cgroup *cgrp) +{ + u32 zero = 0; + char *path = bpf_map_lookup_elem(&cgrp_path_bufs, &zero); + u32 len = 0, level, max_level; + + if (!path) { + scx_bpf_error("cgrp_path_buf lookup failed"); + return NULL; + } + + max_level = cgrp->level; + if (max_level > 127) + max_level = 127; + + bpf_for(level, 1, max_level + 1) { + int ret; + + if (level > 1 && len < MAX_PATH - 1) + path[len++] = '/'; + + if (len >= MAX_PATH - 1) { + scx_bpf_error("cgrp_path_buf overflow"); + return NULL; + } + + ret = bpf_probe_read_kernel_str(path + len, MAX_PATH - len - 1, + BPF_CORE_READ(cgrp, ancestors[level], kn, name)); + if (ret < 0) { + scx_bpf_error("bpf_probe_read_kernel_str failed"); + return NULL; + } + + len += ret - 1; + } + + if (len >= MAX_PATH - 2) { + scx_bpf_error("cgrp_path_buf overflow"); + return NULL; + } + path[len] = '/'; + path[len + 1] = '\0'; + + return path; +} + +static inline bool match_prefix(const char *prefix, const char *str, u32 max_len) +{ + int c; + + bpf_for(c, 0, max_len) { + if (prefix[c] == '\0') + return true; + if (str[c] != prefix[c]) + return false; + } + return false; +} diff --git a/tools/sched_ext/scx_layered/src/layered_sys.rs b/tools/sched_ext/scx_layered/src/layered_sys.rs new file mode 100644 index 0000000000000..afc821d388d2c --- /dev/null +++ b/tools/sched_ext/scx_layered/src/layered_sys.rs @@ -0,0 +1,10 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/layered_sys.rs")); diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs new file mode 100644 index 0000000000000..838ddd2f6fbb0 --- /dev/null +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -0,0 +1,1635 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
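+
+// Userspace side of scx_layered: parses the layer specs, loads and attaches
+// the BPF scheduler, periodically resizes each layer's CPU allocation based
+// on utilization and load, and reports the collected statistics.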
+#[path = "bpf/.output/layered.skel.rs"] +mod layered; +pub use layered::*; +pub mod layered_sys; + +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::ffi::CStr; +use std::ffi::CString; +use std::fs; +use std::io::Read; +use std::io::Write; +use std::ops::Sub; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::time::Duration; +use std::time::Instant; + +use ::fb_procfs as procfs; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; +use bitvec::prelude::*; +use clap::Parser; +use libbpf_rs::skel::OpenSkel as _; +use libbpf_rs::skel::Skel as _; +use libbpf_rs::skel::SkelBuilder as _; +use log::debug; +use log::info; +use log::trace; +use serde::Deserialize; +use serde::Serialize; + +const MAX_CPUS: usize = layered_sys::consts_MAX_CPUS as usize; +const MAX_PATH: usize = layered_sys::consts_MAX_PATH as usize; +const MAX_COMM: usize = layered_sys::consts_MAX_COMM as usize; +const MAX_LAYER_MATCH_ORS: usize = layered_sys::consts_MAX_LAYER_MATCH_ORS as usize; +const MAX_LAYERS: usize = layered_sys::consts_MAX_LAYERS as usize; +const USAGE_HALF_LIFE: f64 = layered_sys::consts_USAGE_HALF_LIFE as f64 / 1_000_000_000.0; +const NR_GSTATS: usize = layered_sys::global_stat_idx_NR_GSTATS as usize; +const NR_LSTATS: usize = layered_sys::layer_stat_idx_NR_LSTATS as usize; +const NR_LAYER_MATCH_KINDS: usize = layered_sys::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; +const CORE_CACHE_LEVEL: u32 = 2; + +lazy_static::lazy_static! { + static ref NR_POSSIBLE_CPUS: usize = libbpf_rs::num_possible_cpus().unwrap(); + static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE as f64); +} + +/// scx_layered: A highly configurable multi-layer sched_ext scheduler +/// +/// scx_layered allows classifying tasks into multiple layers and applying +/// different scheduling policies to them. The configuration is specified in +/// json and composed of two parts - matches and policies. +/// +/// Matches +/// ======= +/// +/// Whenever a task is forked or its attributes are changed, the task goes +/// through a series of matches to determine the layer it belongs to. A +/// match set is composed of OR groups of AND blocks. An example: +/// +/// "matches": [ +/// [ +/// { +/// "CgroupPrefix": "system.slice/" +/// } +/// ], +/// [ +/// { +/// "CommPrefix": "fbagent" +/// }, +/// { +/// "NiceAbove": 0 +/// } +/// ] +/// ], +/// +/// The outer array contains the OR groups and the inner AND blocks, so the +/// above matches: +/// +/// * Tasks which are in the cgroup sub-hierarchy under "system.slice". +/// * Or tasks whose comm starts with "fbagent" and have a nice value > 0. +/// +/// Currenlty, the following matches are supported: +/// +/// * CgroupPrefix: Matches the prefix of the cgroup that the task belongs +/// to. As this is a string match, whether the pattern has the trailing +/// '/' makes difference. For example, "TOP/CHILD/" only matches tasks +/// which are under that particular cgroup while "TOP/CHILD" also matches +/// tasks under "TOP/CHILD0/" or "TOP/CHILD1/". +/// +/// * CommPrefix: Matches the task's comm prefix. +/// +/// * NiceAbove: Matches if the task's nice value is greater than the +/// pattern. +/// +/// * NiceBelow: Matches if the task's nice value is smaller than the +/// pattern. +/// +/// While there are complexity limitations as the matches are performed in +/// BPF, it is straight-forward to add more types of matches. 
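+///
+/// Note that CommPrefix matches against the kernel comm name, which is
+/// truncated to 15 characters, and that NiceAbove and NiceBelow are strict
+/// comparisons, so a task whose nice value equals the threshold matches
+/// neither.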
+/// +/// Policies +/// ======== +/// +/// The following is an example policy configuration for a layer. +/// +/// "kind": { +/// "Confined": { +/// "cpus_range": [1, 8], +/// "util_range": [0.8, 0.9], +/// ] +/// } +/// } +/// +/// It's of "Confined" kind, which tries to concentrate the layer's tasks +/// into a limited number of CPUs. In the above case, the number of CPUs +/// assigned to the layer is scaled between 1 and 8 so that the per-cpu +/// utilization is kept between 80% and 90%. If the CPUs are loaded higher +/// than 90%, more CPUs are allocated to the layer. If the utilization drops +/// below 80%, the layer loses CPUs. +/// +/// Currently, the following policy kinds are supported: +/// +/// * Confined: Tasks are restricted to the allocated CPUs. The number of +/// CPUs allocated is modulated to keep the per-CPU utilization in +/// "util_range". The range can optionally be restricted with the +/// "cpus_range" property. +/// +/// * Grouped: Similar to Confined but tasks may spill outside if there are +/// idle CPUs outside the allocated ones. If "preempt" is true, tasks in +/// this layer will preempt tasks which belong to other non-preempting +/// layers when no idle CPUs are available. +/// +/// * Open: Prefer the CPUs which are not occupied by Confined or Grouped +/// layers. Tasks in this group will spill into occupied CPUs if there are +/// no unoccupied idle CPUs. If "preempt" is true, tasks in this layer +/// will preempt tasks which belong to other non-preempting layers when no +/// idle CPUs are available. +/// +/// Similar to matches, adding new policies and extending existing ones +/// should be relatively straight-forward. +/// +/// Configuration example and running scx_layered +/// ============================================= +/// +/// A scx_layered config is composed of layer configs and a layer config is +/// composed of a name, a set of matches and a policy block. Running the +/// following will write an example configuration into example.json. +/// +/// $ scx_layered -e example.json +/// +/// Note that the last layer in the configuration must have an empty match +/// set as it must match all tasks which haven't been matched into previous +/// layers. +/// +/// The configuration can be specified in multiple json files and command +/// line arguments. Each must contain valid layer configurations and they're +/// concatenated in the specified order. In most cases, something like the +/// following should do. +/// +/// $ scx_layered file:example.json +/// +/// Statistics +/// ========== +/// +/// scx_layered will print out a set of statistics every monitoring +/// interval. +/// +/// tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 tctx_err=9 proc=6ms +/// busy= 34.2 util= 1733.6 load= 21744.1 fallback_cpu= 1 +/// batch : util/frac= 11.8/ 0.7 load/frac= 29.7: 0.1 tasks= 2597 +/// tot= 3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 +/// cpus= 2 [ 2, 2] 04000001 00000000 +/// immediate: util/frac= 1218.8/ 70.3 load/frac= 21399.9: 98.4 tasks= 1107 +/// tot= 68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00 +/// cpus= 50 [ 50, 50] fbfffffe 000fffff +/// normal : util/frac= 502.9/ 29.0 load/frac= 314.5: 1.4 tasks= 3512 +/// tot= 45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56 +/// cpus= 50 [ 50, 50] fbfffffe 000fffff +/// +/// Global statistics: +/// +/// - tot: Total scheduling events in the period. +/// +/// - local: % that got scheduled directly into an idle CPU. 
+/// +/// - open_idle: % of open layer tasks scheduled into occupied idle CPUs. +/// +/// - affn_viol: % which violated configured policies due to CPU affinity +/// restrictions. +/// +/// - proc: CPU time this binary consumed during the period. +/// +/// - busy: CPU busy % (100% means all CPUs were fully occupied) +/// +/// - util: CPU utilization % (100% means one CPU was fully occupied) +/// +/// - load: Sum of weight * duty_cycle for all tasks +/// +/// Per-layer statistics: +/// +/// - util/frac: CPU utilization and fraction % (sum of fractions across +/// layers is always 100%). +/// +/// - load/frac: Load sum and fraction %. +/// +/// - tasks: Number of tasks. +/// +/// - tot: Total scheduling events. +/// +/// - open_idle: % of tasks scheduled into idle CPUs occupied by other layers. +/// +/// - preempt: % of tasks that preempted other tasks. +/// +/// - affn_viol: % which violated configured policies due to CPU affinity +/// restrictions. +/// +/// - cpus: CUR_NR_CPUS [MIN_NR_CPUS, MAX_NR_CPUS] CUR_CPU_MASK +/// +#[derive(Debug, Parser)] +#[command(verbatim_doc_comment)] +struct Opts { + /// Scheduling slice duration in microseconds. + #[clap(short = 's', long, default_value = "20000")] + slice_us: u64, + + /// Scheduling interval in seconds. + #[clap(short = 'i', long, default_value = "0.1")] + interval: f64, + + /// Monitoring interval in seconds. + #[clap(short = 'm', long, default_value = "2.0")] + monitor: f64, + + /// Disable load-fraction based max layer CPU limit. ***NOTE*** + /// load-fraction calculation is currently broken due to lack of + /// infeasible weight adjustments. Setting this option is recommended. + #[clap(short = 'n', long)] + no_load_frac_limit: bool, + + /// Enable verbose output including libbpf details. Specify multiple + /// times to increase verbosity. + #[clap(short = 'v', long, action = clap::ArgAction::Count)] + verbose: u8, + + /// Write example layer specifications into the file and exit. + #[clap(short = 'e', long)] + example: Option, + + /// Layer specification. An argument should be a string containing one + /// specification. + /// + /// Prefix of cgroup paths whose tasks are in the batch execution layer. + /// Tasks in this layer will get the weight-matching CPU cycles but may + /// experience higher scheduling latencies. + /// + /// The paths don't have the leading '/' and may or may not have trailing + /// '/'. If there is no trailing '/', the prefix matches any cgroups + /// which have matching prefix upto that point. + /// + /// - "" matches all cgroups. + /// - "/" only matches the root cgroup. + /// - "workload" matches both "workload/work" and "workload-1/work". + /// - "workload/" matches "workload/work" but not "workload-1/work". 
+ specs: Vec, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +enum LayerMatch { + CgroupPrefix(String), + CommPrefix(String), + NiceAbove(i32), + NiceBelow(i32), +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +enum LayerKind { + Confined { + cpus_range: Option<(usize, usize)>, + util_range: (f64, f64), + }, + Grouped { + cpus_range: Option<(usize, usize)>, + util_range: (f64, f64), + preempt: bool, + }, + Open { + preempt: bool, + }, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +struct LayerSpec { + name: String, + comment: Option, + matches: Vec>, + kind: LayerKind, +} + +impl LayerSpec { + fn parse(input: &str) -> Result> { + let config: LayerConfig = if input.starts_with("f:") || input.starts_with("file:") { + let mut f = fs::OpenOptions::new() + .read(true) + .open(input.split_once(':').unwrap().1)?; + let mut content = String::new(); + f.read_to_string(&mut content)?; + serde_json::from_str(&content)? + } else { + serde_json::from_str(input)? + }; + Ok(config.specs) + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(transparent)] +struct LayerConfig { + specs: Vec, +} + +fn read_total_cpu(reader: &procfs::ProcReader) -> Result { + reader + .read_stat() + .context("Failed to read procfs")? + .total_cpu + .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) +} + +fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { + match (curr, prev) { + ( + procfs::CpuStat { + user_usec: Some(curr_user), + nice_usec: Some(curr_nice), + system_usec: Some(curr_system), + idle_usec: Some(curr_idle), + iowait_usec: Some(curr_iowait), + irq_usec: Some(curr_irq), + softirq_usec: Some(curr_softirq), + stolen_usec: Some(curr_stolen), + .. + }, + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + .. 
+ }, + ) => { + let idle_usec = curr_idle - prev_idle; + let iowait_usec = curr_iowait - prev_iowait; + let user_usec = curr_user - prev_user; + let system_usec = curr_system - prev_system; + let nice_usec = curr_nice - prev_nice; + let irq_usec = curr_irq - prev_irq; + let softirq_usec = curr_softirq - prev_softirq; + let stolen_usec = curr_stolen - prev_stolen; + + let busy_usec = + user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; + let total_usec = idle_usec + busy_usec + iowait_usec; + if total_usec > 0 { + Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) + } else { + Ok(1.0) + } + } + _ => { + bail!("Missing stats in cpustat"); + } + } +} + +fn copy_into_cstr(dst: &mut [i8], src: &str) { + let cstr = CString::new(src).unwrap(); + let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) }; + dst[0..bytes.len()].copy_from_slice(bytes); +} + +fn format_bitvec(bitvec: &BitVec) -> String { + let mut vals = Vec::::new(); + let mut val: u32 = 0; + for (idx, bit) in bitvec.iter().enumerate() { + if idx > 0 && idx % 32 == 0 { + vals.push(val); + val = 0; + } + if *bit { + val |= 1 << (idx % 32); + } + } + vals.push(val); + let mut output = vals + .iter() + .fold(String::new(), |string, v| format!("{}{:08x} ", string, v)); + output.pop(); + output +} + +fn read_cpu_ctxs(skel: &LayeredSkel) -> Result> { + let mut cpu_ctxs = vec![]; + let cpu_ctxs_vec = skel + .maps() + .cpu_ctxs() + .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY) + .context("Failed to lookup cpu_ctx")? + .unwrap(); + for cpu in 0..*NR_POSSIBLE_CPUS { + cpu_ctxs.push(*unsafe { + &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const layered_sys::cpu_ctx) + }); + } + Ok(cpu_ctxs) +} + +#[derive(Clone, Debug)] +struct BpfStats { + gstats: Vec, + lstats: Vec>, + lstats_sums: Vec, +} + +impl BpfStats { + fn read(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Self { + let mut gstats = vec![0u64; NR_GSTATS]; + let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers]; + + for cpu in 0..*NR_POSSIBLE_CPUS { + for stat in 0..NR_GSTATS { + gstats[stat] += cpu_ctxs[cpu].gstats[stat]; + } + for layer in 0..nr_layers { + for stat in 0..NR_LSTATS { + lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat]; + } + } + } + + let mut lstats_sums = vec![0u64; NR_LSTATS]; + for layer in 0..nr_layers { + for stat in 0..NR_LSTATS { + lstats_sums[stat] += lstats[layer][stat]; + } + } + + Self { + gstats, + lstats, + lstats_sums, + } + } +} + +impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats { + type Output = BpfStats; + + fn sub(self, rhs: &'b BpfStats) -> BpfStats { + let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect(); + BpfStats { + gstats: vec_sub(&self.gstats, &rhs.gstats), + lstats: self + .lstats + .iter() + .zip(rhs.lstats.iter()) + .map(|(l, r)| vec_sub(l, r)) + .collect(), + lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums), + } + } +} + +struct Stats { + nr_layers: usize, + at: Instant, + + nr_layer_tasks: Vec, + + total_load: f64, + layer_loads: Vec, + + total_util: f64, // Running AVG of sum of layer_utils + layer_utils: Vec, + prev_layer_cycles: Vec, + + cpu_busy: f64, // Read from /proc, maybe higher than total_util + prev_total_cpu: procfs::CpuStat, + + bpf_stats: BpfStats, + prev_bpf_stats: BpfStats, +} + +impl Stats { + fn read_layer_loads(skel: &mut LayeredSkel, nr_layers: usize) -> (f64, Vec) { + let one = skel.rodata().ravg_1; + let layer_loads: Vec = skel + .bss() + .layers + .iter() + 
.take(nr_layers) + .map(|layer| layer.load_avg as f64 / one as f64) + .collect(); + (layer_loads.iter().sum(), layer_loads) + } + + fn read_layer_cycles(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Vec { + let mut layer_cycles = vec![0u64; nr_layers]; + + for cpu in 0..*NR_POSSIBLE_CPUS { + for layer in 0..nr_layers { + layer_cycles[layer] += cpu_ctxs[cpu].layer_cycles[layer]; + } + } + + layer_cycles + } + + fn new(skel: &mut LayeredSkel, proc_reader: &procfs::ProcReader) -> Result { + let nr_layers = skel.rodata().nr_layers as usize; + let bpf_stats = BpfStats::read(&read_cpu_ctxs(skel)?, nr_layers); + + Ok(Self { + at: Instant::now(), + nr_layers, + + nr_layer_tasks: vec![0; nr_layers], + + total_load: 0.0, + layer_loads: vec![0.0; nr_layers], + + total_util: 0.0, + layer_utils: vec![0.0; nr_layers], + prev_layer_cycles: vec![0; nr_layers], + + cpu_busy: 0.0, + prev_total_cpu: read_total_cpu(&proc_reader)?, + + bpf_stats: bpf_stats.clone(), + prev_bpf_stats: bpf_stats, + }) + } + + fn refresh( + &mut self, + skel: &mut LayeredSkel, + proc_reader: &procfs::ProcReader, + now: Instant, + ) -> Result<()> { + let elapsed = now.duration_since(self.at).as_secs_f64() as f64; + let cpu_ctxs = read_cpu_ctxs(skel)?; + + let nr_layer_tasks: Vec = skel + .bss() + .layers + .iter() + .take(self.nr_layers) + .map(|layer| layer.nr_tasks as usize) + .collect(); + + let (total_load, layer_loads) = Self::read_layer_loads(skel, self.nr_layers); + + let cur_layer_cycles = Self::read_layer_cycles(&cpu_ctxs, self.nr_layers); + let cur_layer_utils: Vec = cur_layer_cycles + .iter() + .zip(self.prev_layer_cycles.iter()) + .map(|(cur, prev)| (cur - prev) as f64 / 1_000_000_000.0 / elapsed) + .collect(); + let layer_utils: Vec = cur_layer_utils + .iter() + .zip(self.layer_utils.iter()) + .map(|(cur, prev)| { + let decay = USAGE_DECAY.powf(elapsed); + prev * decay + cur * (1.0 - decay) + }) + .collect(); + + let cur_total_cpu = read_total_cpu(proc_reader)?; + let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?; + + let cur_bpf_stats = BpfStats::read(&cpu_ctxs, self.nr_layers); + let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats; + + *self = Self { + at: now, + nr_layers: self.nr_layers, + + nr_layer_tasks, + + total_load, + layer_loads, + + total_util: layer_utils.iter().sum(), + layer_utils: layer_utils.try_into().unwrap(), + prev_layer_cycles: cur_layer_cycles, + + cpu_busy, + prev_total_cpu: cur_total_cpu, + + bpf_stats, + prev_bpf_stats: cur_bpf_stats, + }; + Ok(()) + } +} + +#[derive(Debug, Default)] +struct UserExitInfo { + exit_type: i32, + reason: Option, + msg: Option, +} + +impl UserExitInfo { + fn read(bpf_uei: &layered_bss_types::user_exit_info) -> Result { + let exit_type = unsafe { std::ptr::read_volatile(&bpf_uei.exit_type as *const _) }; + + let (reason, msg) = if exit_type != 0 { + ( + Some( + unsafe { CStr::from_ptr(bpf_uei.reason.as_ptr() as *const _) } + .to_str() + .context("Failed to convert reason to string")? + .to_string(), + ) + .filter(|s| !s.is_empty()), + Some( + unsafe { CStr::from_ptr(bpf_uei.msg.as_ptr() as *const _) } + .to_str() + .context("Failed to convert msg to string")? 
+ .to_string(), + ) + .filter(|s| !s.is_empty()), + ) + } else { + (None, None) + }; + + Ok(Self { + exit_type, + reason, + msg, + }) + } + + fn exited(bpf_uei: &layered_bss_types::user_exit_info) -> Result { + Ok(Self::read(bpf_uei)?.exit_type != 0) + } + + fn report(&self) -> Result<()> { + let why = match (&self.reason, &self.msg) { + (Some(reason), None) => format!("{}", reason), + (Some(reason), Some(msg)) => format!("{} ({})", reason, msg), + _ => "".into(), + }; + + match self.exit_type { + 0 => Ok(()), + etype => { + if etype != 64 { + bail!("BPF exit_type={} {}", etype, why); + } else { + info!("EXIT: {}", why); + Ok(()) + } + } + } + } +} + +#[derive(Debug)] +struct CpuPool { + nr_cores: usize, + nr_cpus: usize, + all_cpus: BitVec, + core_cpus: Vec, + cpu_core: Vec, + available_cores: BitVec, + first_cpu: usize, + fallback_cpu: usize, // next free or the first CPU if none is free +} + +impl CpuPool { + fn new() -> Result { + if *NR_POSSIBLE_CPUS > MAX_CPUS { + bail!( + "NR_POSSIBLE_CPUS {} > MAX_CPUS {}", + *NR_POSSIBLE_CPUS, + MAX_CPUS + ); + } + + let mut cpu_to_cache = vec![]; // (cpu_id, Option) + let mut cache_ids = BTreeSet::::new(); + let mut nr_offline = 0; + + // Build cpu -> cache ID mapping. + for cpu in 0..*NR_POSSIBLE_CPUS { + let path = format!( + "/sys/devices/system/cpu/cpu{}/cache/index{}/id", + cpu, CORE_CACHE_LEVEL + ); + let id = match std::fs::read_to_string(&path) { + Ok(val) => Some(val.trim().parse::().with_context(|| { + format!("Failed to parse {:?}'s content {:?}", &path, &val) + })?), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + nr_offline += 1; + None + } + Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), + }; + + cpu_to_cache.push(id); + if let Some(id) = id { + cache_ids.insert(id); + } + } + + let nr_cpus = *NR_POSSIBLE_CPUS - nr_offline; + + // Cache IDs may have holes. Assign consecutive core IDs to existing + // cache IDs. + let mut cache_to_core = BTreeMap::::new(); + let mut nr_cores = 0; + for cache_id in cache_ids.iter() { + cache_to_core.insert(*cache_id, nr_cores); + nr_cores += 1; + } + + // Build core -> cpumask and cpu -> core mappings. 
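+        // A "core" is the group of CPUs sharing the CORE_CACHE_LEVEL (L2)
+        // cache, which usually corresponds to the SMT siblings of one
+        // physical core.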
+ let mut all_cpus = bitvec![0; *NR_POSSIBLE_CPUS]; + let mut core_cpus = vec![bitvec![0; *NR_POSSIBLE_CPUS]; nr_cores]; + let mut cpu_core = vec![]; + + for (cpu, cache) in cpu_to_cache.iter().enumerate().take(*NR_POSSIBLE_CPUS) { + if let Some(cache_id) = cache { + let core_id = cache_to_core[cache_id]; + all_cpus.set(cpu, true); + core_cpus[core_id].set(cpu, true); + cpu_core.push(core_id); + } + } + + info!( + "CPUs: online/possible={}/{} nr_cores={}", + nr_cpus, *NR_POSSIBLE_CPUS, nr_cores, + ); + + let first_cpu = core_cpus[0].first_one().unwrap(); + + let mut cpu_pool = Self { + nr_cores, + nr_cpus, + all_cpus, + core_cpus, + cpu_core, + available_cores: bitvec![1; nr_cores], + first_cpu, + fallback_cpu: first_cpu, + }; + cpu_pool.update_fallback_cpu(); + Ok(cpu_pool) + } + + fn update_fallback_cpu(&mut self) { + match self.available_cores.first_one() { + Some(next) => self.fallback_cpu = self.core_cpus[next].first_one().unwrap(), + None => self.fallback_cpu = self.first_cpu, + } + } + + fn alloc<'a>(&'a mut self) -> Option<&'a BitVec> { + let core = self.available_cores.first_one()?; + self.available_cores.set(core, false); + self.update_fallback_cpu(); + Some(&self.core_cpus[core]) + } + + fn cpus_to_cores(&self, cpus_to_match: &BitVec) -> Result { + let mut cpus = cpus_to_match.clone(); + let mut cores = bitvec![0; self.nr_cores]; + + while let Some(cpu) = cpus.first_one() { + let core = self.cpu_core[cpu]; + + if (self.core_cpus[core].clone() & !cpus.clone()).count_ones() != 0 { + bail!( + "CPUs {} partially intersect with core {} ({})", + cpus_to_match, + core, + self.core_cpus[core], + ); + } + + cpus &= !self.core_cpus[core].clone(); + cores.set(core, true); + } + + Ok(cores) + } + + fn free<'a>(&'a mut self, cpus_to_free: &BitVec) -> Result<()> { + let cores = self.cpus_to_cores(cpus_to_free)?; + if (self.available_cores.clone() & &cores).any() { + bail!("Some of CPUs {} are already free", cpus_to_free); + } + self.available_cores |= cores; + self.update_fallback_cpu(); + Ok(()) + } + + fn next_to_free<'a>(&'a self, cands: &BitVec) -> Result> { + let last = match cands.last_one() { + Some(ret) => ret, + None => return Ok(None), + }; + let core = self.cpu_core[last]; + if (self.core_cpus[core].clone() & !cands.clone()).count_ones() != 0 { + bail!( + "CPUs{} partially intersect with core {} ({})", + cands, + core, + self.core_cpus[core] + ); + } + + Ok(Some(&self.core_cpus[core])) + } + + fn available_cpus(&self) -> BitVec { + let mut cpus = bitvec![0; self.nr_cpus]; + for core in self.available_cores.iter_ones() { + cpus |= &self.core_cpus[core]; + } + cpus + } +} + +#[derive(Debug)] +struct Layer { + name: String, + kind: LayerKind, + + nr_cpus: usize, + cpus: BitVec, +} + +impl Layer { + fn new(cpu_pool: &mut CpuPool, name: &str, kind: LayerKind) -> Result { + match &kind { + LayerKind::Confined { + cpus_range, + util_range, + } => { + let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); + if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 { + bail!("invalid cpus_range {:?}", cpus_range); + } + if util_range.0 < 0.0 + || util_range.0 > 1.0 + || util_range.1 < 0.0 + || util_range.1 > 1.0 + || util_range.0 >= util_range.1 + { + bail!("invalid util_range {:?}", util_range); + } + } + _ => {} + } + + let nr_cpus = cpu_pool.nr_cpus; + + let mut layer = Self { + name: name.into(), + kind, + + nr_cpus: 0, + cpus: bitvec![0; nr_cpus], + }; + + match &layer.kind { + LayerKind::Confined { + cpus_range, + util_range, + } + | LayerKind::Grouped { + cpus_range, + util_range, + 
.. + } => { + layer.resize_confined_or_grouped( + cpu_pool, + *cpus_range, + *util_range, + (0.0, 0.0), + (0.0, 0.0), + false, + )?; + } + _ => {} + } + + Ok(layer) + } + + fn grow_confined_or_grouped( + &mut self, + cpu_pool: &mut CpuPool, + (cpus_min, cpus_max): (usize, usize), + (_util_low, util_high): (f64, f64), + (layer_load, total_load): (f64, f64), + (layer_util, _total_util): (f64, f64), + no_load_frac_limit: bool, + ) -> Result { + if self.nr_cpus >= cpus_max { + return Ok(false); + } + + // Do we already have enough? + if self.nr_cpus >= cpus_min + && (layer_util == 0.0 + || (self.nr_cpus > 0 && layer_util / self.nr_cpus as f64 <= util_high)) + { + return Ok(false); + } + + // Can't have more CPUs than our load fraction. + if !no_load_frac_limit + && self.nr_cpus >= cpus_min + && (total_load >= 0.0 + && self.nr_cpus as f64 / cpu_pool.nr_cpus as f64 >= layer_load / total_load) + { + trace!( + "layer-{} needs more CPUs (util={:.3}) but is over the load fraction", + &self.name, + layer_util + ); + return Ok(false); + } + + let new_cpus = match cpu_pool.alloc().clone() { + Some(ret) => ret.clone(), + None => { + trace!("layer-{} can't grow, no CPUs", &self.name); + return Ok(false); + } + }; + + trace!( + "layer-{} adding {} CPUs to {} CPUs", + &self.name, + new_cpus.count_ones(), + self.nr_cpus + ); + + self.nr_cpus += new_cpus.count_ones(); + self.cpus |= &new_cpus; + Ok(true) + } + + fn cpus_to_free( + &self, + cpu_pool: &mut CpuPool, + (cpus_min, _cpus_max): (usize, usize), + (util_low, util_high): (f64, f64), + (layer_load, total_load): (f64, f64), + (layer_util, _total_util): (f64, f64), + no_load_frac_limit: bool, + ) -> Result> { + if self.nr_cpus <= cpus_min { + return Ok(None); + } + + let cpus_to_free = match cpu_pool.next_to_free(&self.cpus)? { + Some(ret) => ret.clone(), + None => return Ok(None), + }; + + let nr_to_free = cpus_to_free.count_ones(); + + // If we'd be over the load fraction even after freeing + // $cpus_to_free, we have to free. + if !no_load_frac_limit + && total_load >= 0.0 + && (self.nr_cpus - nr_to_free) as f64 / cpu_pool.nr_cpus as f64 + >= layer_load / total_load + { + return Ok(Some(cpus_to_free)); + } + + if layer_util / self.nr_cpus as f64 >= util_low { + return Ok(None); + } + + // Can't shrink if losing the CPUs pushes us over @util_high. + match self.nr_cpus - nr_to_free { + 0 => { + if layer_util > 0.0 { + return Ok(None); + } + } + nr_left => { + if layer_util / nr_left as f64 >= util_high { + return Ok(None); + } + } + } + + return Ok(Some(cpus_to_free)); + } + + fn shrink_confined_or_grouped( + &mut self, + cpu_pool: &mut CpuPool, + cpus_range: (usize, usize), + util_range: (f64, f64), + load: (f64, f64), + util: (f64, f64), + no_load_frac_limit: bool, + ) -> Result { + match self.cpus_to_free( + cpu_pool, + cpus_range, + util_range, + load, + util, + no_load_frac_limit, + )? { + Some(cpus_to_free) => { + trace!("freeing CPUs {}", &cpus_to_free); + self.nr_cpus -= cpus_to_free.count_ones(); + self.cpus &= !cpus_to_free.clone(); + cpu_pool.free(&cpus_to_free)?; + Ok(true) + } + None => Ok(false), + } + } + + fn resize_confined_or_grouped( + &mut self, + cpu_pool: &mut CpuPool, + cpus_range: Option<(usize, usize)>, + util_range: (f64, f64), + load: (f64, f64), + util: (f64, f64), + no_load_frac_limit: bool, + ) -> Result { + let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); + let mut adjusted = 0; + + while self.grow_confined_or_grouped( + cpu_pool, + cpus_range, + util_range, + load, + util, + no_load_frac_limit, + )? 
{ + adjusted += 1; + trace!("{} grew, adjusted={}", &self.name, adjusted); + } + + if adjusted == 0 { + while self.shrink_confined_or_grouped( + cpu_pool, + cpus_range, + util_range, + load, + util, + no_load_frac_limit, + )? { + adjusted -= 1; + trace!("{} shrunk, adjusted={}", &self.name, adjusted); + } + } + + if adjusted != 0 { + trace!("{} done resizing, adjusted={}", &self.name, adjusted); + } + Ok(adjusted) + } +} + +struct Scheduler<'a> { + skel: LayeredSkel<'a>, + struct_ops: Option, + layer_specs: Vec, + + sched_intv: Duration, + monitor_intv: Duration, + no_load_frac_limit: bool, + + cpu_pool: CpuPool, + layers: Vec, + + proc_reader: procfs::ProcReader, + sched_stats: Stats, + report_stats: Stats, + + nr_layer_cpus_min_max: Vec<(usize, usize)>, + processing_dur: Duration, + prev_processing_dur: Duration, +} + +impl<'a> Scheduler<'a> { + fn init_layers(skel: &mut OpenLayeredSkel, specs: &Vec) -> Result<()> { + skel.rodata().nr_layers = specs.len() as u32; + + for (spec_i, spec) in specs.iter().enumerate() { + let layer = &mut skel.bss().layers[spec_i]; + + for (or_i, or) in spec.matches.iter().enumerate() { + for (and_i, and) in or.iter().enumerate() { + let mt = &mut layer.matches[or_i].matches[and_i]; + match and { + LayerMatch::CgroupPrefix(prefix) => { + mt.kind = layered_sys::layer_match_kind_MATCH_CGROUP_PREFIX as i32; + copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str()); + } + LayerMatch::CommPrefix(prefix) => { + mt.kind = layered_sys::layer_match_kind_MATCH_COMM_PREFIX as i32; + copy_into_cstr(&mut mt.comm_prefix, prefix.as_str()); + } + LayerMatch::NiceAbove(nice) => { + mt.kind = layered_sys::layer_match_kind_MATCH_NICE_ABOVE as i32; + mt.nice_above_or_below = *nice; + } + LayerMatch::NiceBelow(nice) => { + mt.kind = layered_sys::layer_match_kind_MATCH_NICE_BELOW as i32; + mt.nice_above_or_below = *nice; + } + } + } + layer.matches[or_i].nr_match_ands = or.len() as i32; + } + + layer.nr_match_ors = spec.matches.len() as u32; + + match &spec.kind { + LayerKind::Open { preempt } | LayerKind::Grouped { preempt, .. } => { + layer.open = true; + layer.preempt = *preempt; + } + _ => {} + } + } + + Ok(()) + } + + fn init(opts: &Opts, layer_specs: Vec) -> Result { + let nr_layers = layer_specs.len(); + let mut cpu_pool = CpuPool::new()?; + + // Open the BPF prog first for verification. + let mut skel_builder = LayeredSkelBuilder::default(); + skel_builder.obj_builder.debug(opts.verbose > 1); + let mut skel = skel_builder.open().context("Failed to open BPF program")?; + + // Initialize skel according to @opts. + skel.rodata().debug = opts.verbose as u32; + skel.rodata().slice_ns = opts.slice_us * 1000; + skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; + skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; + for cpu in cpu_pool.all_cpus.iter_ones() { + skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8); + } + Self::init_layers(&mut skel, &layer_specs)?; + + // Attach. + let mut skel = skel.load().context("Failed to load BPF program")?; + skel.attach().context("Failed to attach BPF program")?; + let struct_ops = Some( + skel.maps_mut() + .layered() + .attach_struct_ops() + .context("Failed to attach layered struct ops")?, + ); + info!("Layered Scheduler Attached"); + + let mut layers = vec![]; + for spec in layer_specs.iter() { + layers.push(Layer::new(&mut cpu_pool, &spec.name, spec.kind.clone())?); + } + + // Other stuff. 
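+        // The procfs reader feeds the CPU busy% calculation; sched_stats and
+        // report_stats track deltas over the scheduling and reporting
+        // intervals respectively.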
+ let proc_reader = procfs::ProcReader::new(); + + Ok(Self { + struct_ops, // should be held to keep it attached + layer_specs, + + sched_intv: Duration::from_secs_f64(opts.interval), + monitor_intv: Duration::from_secs_f64(opts.monitor), + no_load_frac_limit: opts.no_load_frac_limit, + + cpu_pool, + layers, + + sched_stats: Stats::new(&mut skel, &proc_reader)?, + report_stats: Stats::new(&mut skel, &proc_reader)?, + + nr_layer_cpus_min_max: vec![(0, 0); nr_layers], + processing_dur: Duration::from_millis(0), + prev_processing_dur: Duration::from_millis(0), + + proc_reader, + skel, + }) + } + + fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut layered_bss_types::layer) { + for bit in 0..layer.cpus.len() { + if layer.cpus[bit] { + bpf_layer.cpus[bit / 8] |= 1 << (bit % 8); + } else { + bpf_layer.cpus[bit / 8] &= !(1 << (bit % 8)); + } + } + bpf_layer.refresh_cpus = 1; + } + + fn step(&mut self) -> Result<()> { + let started_at = Instant::now(); + self.sched_stats + .refresh(&mut self.skel, &self.proc_reader, started_at)?; + let mut updated = false; + + for idx in 0..self.layers.len() { + match self.layers[idx].kind { + LayerKind::Confined { + cpus_range, + util_range, + } + | LayerKind::Grouped { + cpus_range, + util_range, + .. + } => { + let load = ( + self.sched_stats.layer_loads[idx], + self.sched_stats.total_load, + ); + let util = ( + self.sched_stats.layer_utils[idx], + self.sched_stats.total_util, + ); + if self.layers[idx].resize_confined_or_grouped( + &mut self.cpu_pool, + cpus_range, + util_range, + load, + util, + self.no_load_frac_limit, + )? != 0 + { + Self::update_bpf_layer_cpumask( + &self.layers[idx], + &mut self.skel.bss().layers[idx], + ); + updated = true; + } + } + _ => {} + } + } + + if updated { + let available_cpus = self.cpu_pool.available_cpus(); + let nr_available_cpus = available_cpus.count_ones(); + for idx in 0..self.layers.len() { + let layer = &mut self.layers[idx]; + let bpf_layer = &mut self.skel.bss().layers[idx]; + match &layer.kind { + LayerKind::Open { .. 
} => { + layer.cpus.copy_from_bitslice(&available_cpus); + layer.nr_cpus = nr_available_cpus; + Self::update_bpf_layer_cpumask(layer, bpf_layer); + } + _ => {} + } + } + + self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32; + + for (lidx, layer) in self.layers.iter().enumerate() { + self.nr_layer_cpus_min_max[lidx] = ( + self.nr_layer_cpus_min_max[lidx].0.min(layer.nr_cpus), + self.nr_layer_cpus_min_max[lidx].1.max(layer.nr_cpus), + ); + } + } + + self.processing_dur += Instant::now().duration_since(started_at); + Ok(()) + } + + fn report(&mut self) -> Result<()> { + let started_at = Instant::now(); + self.report_stats + .refresh(&mut self.skel, &self.proc_reader, started_at)?; + let stats = &self.report_stats; + + let processing_dur = self.processing_dur - self.prev_processing_dur; + self.prev_processing_dur = self.processing_dur; + + let lsum = |idx| stats.bpf_stats.lstats_sums[idx as usize]; + let total = lsum(layered_sys::layer_stat_idx_LSTAT_LOCAL) + + lsum(layered_sys::layer_stat_idx_LSTAT_GLOBAL); + let lsum_pct = |idx| { + if total != 0 { + lsum(idx) as f64 / total as f64 * 100.0 + } else { + 0.0 + } + }; + + info!( + "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", + total, + lsum_pct(layered_sys::layer_stat_idx_LSTAT_LOCAL), + lsum_pct(layered_sys::layer_stat_idx_LSTAT_OPEN_IDLE), + lsum_pct(layered_sys::layer_stat_idx_LSTAT_AFFN_VIOL), + stats.prev_bpf_stats.gstats + [layered_sys::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], + processing_dur.as_millis(), + ); + + info!( + "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", + stats.cpu_busy * 100.0, + stats.total_util * 100.0, + stats.total_load, + self.cpu_pool.fallback_cpu, + ); + + let header_width = self + .layer_specs + .iter() + .map(|spec| spec.name.len()) + .max() + .unwrap() + .max(4); + + let calc_frac = |a, b| { + if b != 0.0 { a / b * 100.0 } else { 0.0 } + }; + + for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() { + let lstat = |sidx| stats.bpf_stats.lstats[lidx][sidx as usize]; + let ltotal = lstat(layered_sys::layer_stat_idx_LSTAT_LOCAL) + + lstat(layered_sys::layer_stat_idx_LSTAT_GLOBAL); + let lstat_pct = |sidx| { + if ltotal != 0 { + lstat(sidx) as f64 / ltotal as f64 * 100.0 + } else { + 0.0 + } + }; + + info!( + " {:) -> Result<()> { + let now = Instant::now(); + let mut next_sched_at = now + self.sched_intv; + let mut next_monitor_at = now + self.monitor_intv; + + while !shutdown.load(Ordering::Relaxed) && !UserExitInfo::exited(&self.skel.bss().uei)? 
{ + let now = Instant::now(); + + if now >= next_sched_at { + self.step()?; + while next_sched_at < now { + next_sched_at += self.sched_intv; + } + } + + if now >= next_monitor_at { + self.report()?; + while next_monitor_at < now { + next_monitor_at += self.monitor_intv; + } + } + + std::thread::sleep( + next_sched_at + .min(next_monitor_at) + .duration_since(Instant::now()), + ); + } + + self.struct_ops.take(); + UserExitInfo::read(&self.skel.bss().uei)?.report() + } +} + +impl<'a> Drop for Scheduler<'a> { + fn drop(&mut self) { + if let Some(struct_ops) = self.struct_ops.take() { + drop(struct_ops); + } + } +} + +fn write_example_file(path: &str) -> Result<()> { + let example = LayerConfig { + specs: vec![ + LayerSpec { + name: "batch".into(), + comment: Some("tasks under system.slice or tasks with nice value > 0".into()), + matches: vec![ + vec![LayerMatch::CgroupPrefix("system.slice/".into())], + vec![LayerMatch::NiceAbove(0)], + ], + kind: LayerKind::Confined { + cpus_range: Some((0, 16)), + util_range: (0.8, 0.9), + }, + }, + LayerSpec { + name: "immediate".into(), + comment: Some("tasks under workload.slice with nice value < 0".into()), + matches: vec![vec![ + LayerMatch::CgroupPrefix("workload.slice/".into()), + LayerMatch::NiceBelow(0), + ]], + kind: LayerKind::Open { preempt: true }, + }, + LayerSpec { + name: "normal".into(), + comment: Some("the rest".into()), + matches: vec![vec![]], + kind: LayerKind::Grouped { + cpus_range: None, + util_range: (0.5, 0.6), + preempt: false, + }, + }, + ], + }; + + let mut f = fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(path)?; + Ok(f.write_all(serde_json::to_string_pretty(&example)?.as_bytes())?) +} + +fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> { + let nr_specs = specs.len(); + if nr_specs == 0 { + bail!("No layer spec"); + } + if nr_specs > MAX_LAYERS { + bail!("Too many layer specs"); + } + + for (idx, spec) in specs.iter().enumerate() { + if idx < nr_specs - 1 { + if spec.matches.len() == 0 { + bail!("Non-terminal spec {:?} has NULL matches", spec.name); + } + } else { + if spec.matches.len() != 1 || spec.matches[0].len() != 0 { + bail!("Terminal spec {:?} must have an empty match", spec.name); + } + } + + if spec.matches.len() > MAX_LAYER_MATCH_ORS { + bail!( + "Spec {:?} has too many ({}) OR match blocks", + spec.name, + spec.matches.len() + ); + } + + for (ands_idx, ands) in spec.matches.iter().enumerate() { + if ands.len() > NR_LAYER_MATCH_KINDS { + bail!( + "Spec {:?}'s {}th OR block has too many ({}) match conditions", + spec.name, + ands_idx, + ands.len() + ); + } + for one in ands.iter() { + match one { + LayerMatch::CgroupPrefix(prefix) => { + if prefix.len() > MAX_PATH { + bail!("Spec {:?} has too long a cgroup prefix", spec.name); + } + } + LayerMatch::CommPrefix(prefix) => { + if prefix.len() > MAX_COMM { + bail!("Spec {:?} has too long a comm prefix", spec.name); + } + } + _ => {} + } + } + } + + match spec.kind { + LayerKind::Confined { + cpus_range, + util_range, + } + | LayerKind::Grouped { + cpus_range, + util_range, + .. 
+ } => { + if let Some((cpus_min, cpus_max)) = cpus_range { + if cpus_min > cpus_max { + bail!( + "Spec {:?} has invalid cpus_range({}, {})", + spec.name, + cpus_min, + cpus_max + ); + } + } + if util_range.0 >= util_range.1 { + bail!( + "Spec {:?} has invalid util_range ({}, {})", + spec.name, + util_range.0, + util_range.1 + ); + } + } + _ => {} + } + } + + Ok(()) +} + +fn main() -> Result<()> { + let opts = Opts::parse(); + + let llv = match opts.verbose { + 0 => simplelog::LevelFilter::Info, + 1 => simplelog::LevelFilter::Debug, + _ => simplelog::LevelFilter::Trace, + }; + let mut lcfg = simplelog::ConfigBuilder::new(); + lcfg.set_time_level(simplelog::LevelFilter::Error) + .set_location_level(simplelog::LevelFilter::Off) + .set_target_level(simplelog::LevelFilter::Off) + .set_thread_level(simplelog::LevelFilter::Off); + simplelog::TermLogger::init( + llv, + lcfg.build(), + simplelog::TerminalMode::Stderr, + simplelog::ColorChoice::Auto, + )?; + + debug!("opts={:?}", &opts); + + if let Some(path) = &opts.example { + write_example_file(path)?; + return Ok(()); + } + + let mut layer_config = LayerConfig { specs: vec![] }; + for (idx, input) in opts.specs.iter().enumerate() { + layer_config.specs.append( + &mut LayerSpec::parse(input) + .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?, + ); + } + + debug!("specs={}", serde_json::to_string_pretty(&layer_config)?); + verify_layer_specs(&layer_config.specs)?; + + let mut sched = Scheduler::init(&opts, layer_config.specs)?; + + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_clone = shutdown.clone(); + ctrlc::set_handler(move || { + shutdown_clone.store(true, Ordering::Relaxed); + }) + .context("Error setting Ctrl-C handler")?; + + sched.run(shutdown) +} From c2f53c8ad132e2bbf478769b0a45b05bd9cd7b24 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 31 Oct 2023 15:31:21 -1000 Subject: [PATCH 125/304] scx_examples: Address the interaction between yield and slice based runtime calculation Calculating runtime from the amount consumed from slice punishes yield(2)ers. There's nothing fundamentally wrong with it but it doesn't align well with how cfs does it and can have unexpected effects on applications. Note the caveat in the example schedulers and switch scx_rusty to use timestamp based one. --- tools/sched_ext/scx_flatcg.bpf.c | 10 +++++++++- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 19 ++++++++++++++++--- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 1 + tools/sched_ext/scx_simple.bpf.c | 10 +++++++++- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 6d8c6f396577a..2db3d8d45e683 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -510,7 +510,15 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) struct cgroup *cgrp; struct fcg_cgrp_ctx *cgc; - /* scale the execution time by the inverse of the weight and charge */ + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
+ */ if (!fifo_sched) p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 436297e6dcac9..134bceb269b8e 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -681,8 +681,10 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) scx_bpf_error("Failed to lookup task_ctx"); return; } - dom_id = taskc->dom_id; + taskc->running_at = bpf_ktime_get_ns(); + + dom_id = taskc->dom_id; domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); if (!domc) { scx_bpf_error("Failed to lookup dom[%u]", dom_id); @@ -701,11 +703,20 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) { + struct task_ctx *taskc; + pid_t pid = p->pid; + if (fifo_sched) return; + if (!(taskc = bpf_map_lookup_elem(&task_data, &pid))) { + scx_bpf_error("Failed to lookup task_ctx"); + return; + } + /* scale the execution time by the inverse of the weight and charge */ - p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; + p->scx.dsq_vtime += + (bpf_ktime_get_ns() - taskc->running_at) * 100 / p->scx.weight; } void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) @@ -792,7 +803,9 @@ void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, task_pick_and_set_domain(task_ctx, p, cpumask, false); if (all_cpumask) - task_ctx->all_cpus = bpf_cpumask_subset(all_cpumask, cpumask); + task_ctx->all_cpus = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, + cpumask); } s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index 28eed277fd8af..10cfe63c6a9a7 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -49,6 +49,7 @@ struct task_ctx { unsigned int dom_id; unsigned int weight; unsigned long long runnable_at; + unsigned long long running_at; unsigned long long runnable_for; /* The task is a workqueue worker thread */ diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index d4528c7da4500..56b589d7f6630 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -101,7 +101,15 @@ void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) if (fifo_sched) return; - /* scale the execution time by the inverse of the weight and charge */ + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
+ */ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; } From e199c47a32485da2dd5a8894f928abc2fc950c2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 1 Nov 2023 10:19:57 -1000 Subject: [PATCH 126/304] scx_rusty: Introduce lookup_task_ctx() and consistently use @taskc as task_ctx var name --- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 136 ++++++++---------- 1 file changed, 62 insertions(+), 74 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 134bceb269b8e..458c139bfcfb7 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -131,6 +131,19 @@ struct { __uint(map_flags, 0); } task_data SEC(".maps"); +struct task_ctx *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx *taskc; + s32 pid = p->pid; + + if ((taskc = bpf_map_lookup_elem(&task_data, &pid))) { + return taskc; + } else { + scx_bpf_error("task_ctx lookup failed for pid %d", p->pid); + return NULL; + } +} + /* * This is populated from userspace to indicate which pids should be reassigned * to new doms. @@ -275,16 +288,14 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); - struct task_ctx *task_ctx; + struct task_ctx *taskc; struct bpf_cpumask *p_cpumask; - pid_t pid = p->pid; bool prev_domestic, has_idle_cores; s32 cpu; refresh_tune_params(); - if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid)) || - !(p_cpumask = task_ctx->cpumask)) + if (!(taskc = lookup_task_ctx(p)) || !(p_cpumask = taskc->cpumask)) goto enoent; if (kthreads_local && @@ -302,22 +313,21 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, struct task_struct *current = (void *)bpf_get_current_task(); if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && - task_ctx->dom_id < MAX_DOMS) { + taskc->dom_id < MAX_DOMS) { struct dom_ctx *domc; struct bpf_cpumask *d_cpumask; const struct cpumask *idle_cpumask; bool has_idle; - domc = bpf_map_lookup_elem(&dom_ctx, &task_ctx->dom_id); + domc = bpf_map_lookup_elem(&dom_ctx, &taskc->dom_id); if (!domc) { - scx_bpf_error("Failed to find dom%u", - task_ctx->dom_id); + scx_bpf_error("Failed to find dom%u", taskc->dom_id); goto enoent; } d_cpumask = domc->cpumask; if (!d_cpumask) { scx_bpf_error("Failed to acquire dom%u cpumask kptr", - task_ctx->dom_id); + taskc->dom_id); goto enoent; } @@ -418,7 +428,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, * under-utilized, ignore domain boundaries and push the task there. Try * to find an idle core first. 
*/ - if (task_ctx->all_cpus && direct_greedy_cpumask && + if (taskc->all_cpus && direct_greedy_cpumask && !bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) { u32 dom_id = cpu_to_dom_id(prev_cpu); struct dom_ctx *domc; @@ -488,7 +498,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, return cpu; direct: - task_ctx->dispatch_local = true; + taskc->dispatch_local = true; scx_bpf_put_idle_cpumask(idle_smtmask); return cpu; @@ -499,15 +509,16 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) { - struct task_ctx *task_ctx; + struct task_ctx *taskc; struct bpf_cpumask *p_cpumask; pid_t pid = p->pid; u32 *new_dom; s32 cpu; - if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid)) || - !(p_cpumask = task_ctx->cpumask)) { - scx_bpf_error("Failed to lookup task_ctx or cpumask"); + if (!(taskc = lookup_task_ctx(p))) + return; + if (!(p_cpumask = taskc->cpumask)) { + scx_bpf_error("NULL cpmask"); return; } @@ -515,18 +526,18 @@ void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) * Migrate @p to a new domain if requested by userland through lb_data. */ new_dom = bpf_map_lookup_elem(&lb_data, &pid); - if (new_dom && *new_dom != task_ctx->dom_id && - task_set_domain(task_ctx, p, *new_dom, false)) { + if (new_dom && *new_dom != taskc->dom_id && + task_set_domain(taskc, p, *new_dom, false)) { stat_add(RUSTY_STAT_LOAD_BALANCE, 1); - task_ctx->dispatch_local = false; + taskc->dispatch_local = false; cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); if (cpu >= 0) scx_bpf_kick_cpu(cpu, 0); goto dom_queue; } - if (task_ctx->dispatch_local) { - task_ctx->dispatch_local = false; + if (taskc->dispatch_local) { + taskc->dispatch_local = false; scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; } @@ -547,11 +558,10 @@ void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) dom_queue: if (fifo_sched) { - scx_bpf_dispatch(p, task_ctx->dom_id, slice_ns, - enq_flags); + scx_bpf_dispatch(p, taskc->dom_id, slice_ns, enq_flags); } else { u64 vtime = p->scx.dsq_vtime; - u32 dom_id = task_ctx->dom_id; + u32 dom_id = taskc->dom_id; struct dom_ctx *domc; domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); @@ -567,8 +577,7 @@ void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) if (vtime_before(vtime, domc->vtime_now - slice_ns)) vtime = domc->vtime_now - slice_ns; - scx_bpf_dispatch_vtime(p, task_ctx->dom_id, slice_ns, vtime, - enq_flags); + scx_bpf_dispatch_vtime(p, taskc->dom_id, slice_ns, vtime, enq_flags); } /* @@ -586,7 +595,7 @@ void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) * CPUs are highly loaded while KICK_GREEDY doesn't. Even under fairly * high utilization, KICK_GREEDY can slightly improve work-conservation. 
*/ - if (task_ctx->all_cpus && kick_greedy_cpumask) { + if (taskc->all_cpus && kick_greedy_cpumask) { cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) kick_greedy_cpumask, 0); if (cpu >= 0) { @@ -654,33 +663,26 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev) void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) { - struct task_ctx *task_ctx; - pid_t pid = p->pid; + struct task_ctx *taskc; - if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { - scx_bpf_error("Failed to lookup task_ctx"); + if (!(taskc = lookup_task_ctx(p))) return; - } - task_ctx->runnable_at = bpf_ktime_get_ns(); - task_ctx->is_kworker = p->flags & PF_WQ_WORKER; + taskc->runnable_at = bpf_ktime_get_ns(); + taskc->is_kworker = p->flags & PF_WQ_WORKER; } void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) { struct task_ctx *taskc; struct dom_ctx *domc; - pid_t pid = p->pid; u32 dom_id; if (fifo_sched) return; - taskc = bpf_map_lookup_elem(&task_data, &pid); - if (!taskc) { - scx_bpf_error("Failed to lookup task_ctx"); + if (!(taskc = lookup_task_ctx(p))) return; - } taskc->running_at = bpf_ktime_get_ns(); @@ -704,15 +706,12 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) { struct task_ctx *taskc; - pid_t pid = p->pid; if (fifo_sched) return; - if (!(taskc = bpf_map_lookup_elem(&task_data, &pid))) { - scx_bpf_error("Failed to lookup task_ctx"); + if (!(taskc = lookup_task_ctx(p))) return; - } /* scale the execution time by the inverse of the weight and charge */ p->scx.dsq_vtime += @@ -721,32 +720,26 @@ void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) { - struct task_ctx *task_ctx; - pid_t pid = p->pid; + struct task_ctx *taskc; - if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { - scx_bpf_error("Failed to lookup task_ctx"); + if (!(taskc = lookup_task_ctx(p))) return; - } - task_ctx->runnable_for += bpf_ktime_get_ns() - task_ctx->runnable_at; - task_ctx->runnable_at = 0; + taskc->runnable_for += bpf_ktime_get_ns() - taskc->runnable_at; + taskc->runnable_at = 0; } void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight) { - struct task_ctx *task_ctx; - pid_t pid = p->pid; + struct task_ctx *taskc; - if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { - scx_bpf_error("Failed to lookup task_ctx"); + if (!(taskc = lookup_task_ctx(p))) return; - } - task_ctx->weight = weight; + taskc->weight = weight; } -static u32 task_pick_domain(struct task_ctx *task_ctx, struct task_struct *p, +static u32 task_pick_domain(struct task_ctx *taskc, struct task_struct *p, const struct cpumask *cpumask) { s32 cpu = bpf_get_smp_processor_id(); @@ -755,13 +748,13 @@ static u32 task_pick_domain(struct task_ctx *task_ctx, struct task_struct *p, if (cpu < 0 || cpu >= MAX_CPUS) return MAX_DOMS; - task_ctx->dom_mask = 0; + taskc->dom_mask = 0; dom = pcpu_ctx[cpu].dom_rr_cur++; bpf_repeat(nr_doms) { dom = (dom + 1) % nr_doms; if (cpumask_intersects_domain(cpumask, dom)) { - task_ctx->dom_mask |= 1LLU << dom; + taskc->dom_mask |= 1LLU << dom; /* * AsThe starting point is round-robin'd and the first * match should be spread across all the domains. 
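
All of the conversions in this patch reduce to one idiom: callbacks that used to open-code the bpf_map_lookup_elem() + scx_bpf_error() pair now call lookup_task_ctx() and bail out on NULL. As a rough sketch of what any further callback would look like under this convention (the callback name and body are illustrative only, not taken from the patch):

	void BPF_STRUCT_OPS(rusty_example_cb, struct task_struct *p)
	{
		struct task_ctx *taskc;

		/* lookup_task_ctx() already reports the failure via scx_bpf_error() */
		if (!(taskc = lookup_task_ctx(p)))
			return;

		/* ... operate on taskc ... */
	}
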
@@ -774,7 +767,7 @@ static u32 task_pick_domain(struct task_ctx *task_ctx, struct task_struct *p, return first_dom; } -static void task_pick_and_set_domain(struct task_ctx *task_ctx, +static void task_pick_and_set_domain(struct task_ctx *taskc, struct task_struct *p, const struct cpumask *cpumask, bool init_dsq_vtime) @@ -782,9 +775,9 @@ static void task_pick_and_set_domain(struct task_ctx *task_ctx, u32 dom_id = 0; if (nr_doms > 1) - dom_id = task_pick_domain(task_ctx, p, cpumask); + dom_id = task_pick_domain(taskc, p, cpumask); - if (!task_set_domain(task_ctx, p, dom_id, init_dsq_vtime)) + if (!task_set_domain(taskc, p, dom_id, init_dsq_vtime)) scx_bpf_error("Failed to set dom%d for %s[%d]", dom_id, p->comm, p->pid); } @@ -792,34 +785,29 @@ static void task_pick_and_set_domain(struct task_ctx *task_ctx, void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, const struct cpumask *cpumask) { - struct task_ctx *task_ctx; - pid_t pid = p->pid; + struct task_ctx *taskc; - if (!(task_ctx = bpf_map_lookup_elem(&task_data, &pid))) { - scx_bpf_error("Failed to lookup task_ctx for %s[%d]", - p->comm, pid); + if (!(taskc = lookup_task_ctx(p))) return; - } - task_pick_and_set_domain(task_ctx, p, cpumask, false); + task_pick_and_set_domain(taskc, p, cpumask, false); if (all_cpumask) - task_ctx->all_cpus = - bpf_cpumask_subset((const struct cpumask *)all_cpumask, - cpumask); + taskc->all_cpus = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, struct scx_enable_args *args) { struct bpf_cpumask *cpumask; - struct task_ctx task_ctx, *map_value; + struct task_ctx taskc, *map_value; long ret; pid_t pid; - memset(&task_ctx, 0, sizeof(task_ctx)); + memset(&taskc, 0, sizeof(taskc)); pid = p->pid; - ret = bpf_map_update_elem(&task_data, &pid, &task_ctx, BPF_NOEXIST); + ret = bpf_map_update_elem(&task_data, &pid, &taskc, BPF_NOEXIST); if (ret) { stat_add(RUSTY_STAT_TASK_GET_ERR, 1); return ret; From 1b268b0e4e7404f649e6712d5630d21adf9526ba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 1 Nov 2023 12:06:55 -1000 Subject: [PATCH 127/304] scx_rusty: Use u64 and friends consistently and move dom_ctx def to .h --- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 32 ++++++++----------- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 22 +++++++++---- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 458c139bfcfb7..5d3af55691913 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -55,18 +55,18 @@ char _license[] SEC("license") = "GPL"; /* * Domains and cpus */ -const volatile __u32 nr_doms = 32; /* !0 for veristat, set during init */ -const volatile __u32 nr_cpus = 64; /* !0 for veristat, set during init */ -const volatile __u32 cpu_dom_id_map[MAX_CPUS]; -const volatile __u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; +const volatile u32 nr_doms = 32; /* !0 for veristat, set during init */ +const volatile u32 nr_cpus = 64; /* !0 for veristat, set during init */ +const volatile u32 cpu_dom_id_map[MAX_CPUS]; +const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; const volatile bool kthreads_local; const volatile bool fifo_sched; const volatile bool switch_partial; -const volatile __u32 greedy_threshold; +const volatile u32 greedy_threshold; /* base slice duration */ -const volatile __u64 slice_ns = SCX_SLICE_DFL; +const volatile u64 slice_ns = SCX_SLICE_DFL; /* 
* Exit info @@ -78,10 +78,10 @@ char exit_msg[SCX_EXIT_MSG_LEN]; * Per-CPU context */ struct pcpu_ctx { - __u32 dom_rr_cur; /* used when scanning other doms */ + u32 dom_rr_cur; /* used when scanning other doms */ /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */ - __u8 _padding[CACHELINE_SIZE - sizeof(u32)]; + u8 _padding[CACHELINE_SIZE - sizeof(u32)]; } __attribute__((aligned(CACHELINE_SIZE))); struct pcpu_ctx pcpu_ctx[MAX_CPUS]; @@ -89,12 +89,6 @@ struct pcpu_ctx pcpu_ctx[MAX_CPUS]; /* * Domain context */ -struct dom_ctx { - struct bpf_cpumask __kptr *cpumask; - struct bpf_cpumask __kptr *direct_greedy_cpumask; - u64 vtime_now; -}; - struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); @@ -162,12 +156,12 @@ struct { * that can be used directly in the scheduling paths. */ struct tune_input{ - __u64 gen; - __u64 direct_greedy_cpumask[MAX_CPUS / 64]; - __u64 kick_greedy_cpumask[MAX_CPUS / 64]; + u64 gen; + u64 direct_greedy_cpumask[MAX_CPUS / 64]; + u64 kick_greedy_cpumask[MAX_CPUS / 64]; } tune_input; -__u64 tune_params_gen; +u64 tune_params_gen; private(A) struct bpf_cpumask __kptr *all_cpumask; private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask; private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask; @@ -884,7 +878,7 @@ static s32 create_dom(u32 dom_id) } for (cpu = 0; cpu < MAX_CPUS; cpu++) { - const volatile __u64 *dmask; + const volatile u64 *dmask; dmask = MEMBER_VPTR(dom_cpumasks, [dom_id][cpu / 64]); if (!dmask) { diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index 10cfe63c6a9a7..5a48c78fe9174 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -13,6 +13,10 @@ #define __kptr #endif +typedef unsigned char u8; +typedef unsigned int u32; +typedef unsigned long long u64; + #define MAX_CPUS 512 #define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ #define CACHELINE_SIZE 64 @@ -43,14 +47,14 @@ enum stat_idx { struct task_ctx { /* The domains this task can run on */ - unsigned long long dom_mask; + u64 dom_mask; struct bpf_cpumask __kptr *cpumask; - unsigned int dom_id; - unsigned int weight; - unsigned long long runnable_at; - unsigned long long running_at; - unsigned long long runnable_for; + u32 dom_id; + u32 weight; + u64 runnable_at; + u64 running_at; + u64 runnable_for; /* The task is a workqueue worker thread */ bool is_kworker; @@ -62,4 +66,10 @@ struct task_ctx { bool dispatch_local; }; +struct dom_ctx { + struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *direct_greedy_cpumask; + u64 vtime_now; +}; + #endif /* __RUSTY_H */ From 53f76a92e4e6c5dcbdd39e6672897562976b0563 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 1 Nov 2023 14:52:53 -1000 Subject: [PATCH 128/304] scx_rusty: ravg in progress, load transfer incomplete --- tools/sched_ext/scx_ravg.bpf.h | 42 +++ tools/sched_ext/scx_ravg_impl.bpf.h | 292 ++++++++++++++++++ tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 64 +++- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 10 +- 4 files changed, 399 insertions(+), 9 deletions(-) create mode 100644 tools/sched_ext/scx_ravg.bpf.h create mode 100644 tools/sched_ext/scx_ravg_impl.bpf.h diff --git a/tools/sched_ext/scx_ravg.bpf.h b/tools/sched_ext/scx_ravg.bpf.h new file mode 100644 index 0000000000000..a233d85d05aa6 --- /dev/null +++ b/tools/sched_ext/scx_ravg.bpf.h @@ -0,0 +1,42 @@ +#ifndef __SCX_RAVG_BPF_H__ +#define __SCX_RAVG_BPF_H__ + +/* + * Running average helpers to be used in BPF progs. 
Assumes vmlinux.h has + * already been included. + */ +enum ravg_consts { + RAVG_VAL_BITS = 44, /* input values are 44bit */ + RAVG_FRAC_BITS = 20, /* 1048576 is 1.0 */ +}; + +/* + * Running avg mechanism. Accumulates values between 0 and RAVG_MAX_VAL in + * arbitrary time intervals. The accumulated values are halved every half_life + * with each period starting when the current time % half_life is 0. Zeroing is + * enough for initialization. + * + * See ravg_accumulate() and ravg_read() for more details. + */ +struct ravg_data { + /* current value */ + u64 val; + + /* + * The timestamp of @val. The latest completed seq #: + * + * (val_at / half_life) - 1 + */ + u64 val_at; + + /* running avg as of the latest completed seq */ + u64 old; + + /* + * Accumulated value of the current period. Input value is 48bits and we + * normalize half-life to 16bit, so it should fit in an u64. + */ + u64 cur; +}; + +#endif /* __SCX_RAVG_BPF_H__ */ diff --git a/tools/sched_ext/scx_ravg_impl.bpf.h b/tools/sched_ext/scx_ravg_impl.bpf.h new file mode 100644 index 0000000000000..245b0671e4386 --- /dev/null +++ b/tools/sched_ext/scx_ravg_impl.bpf.h @@ -0,0 +1,292 @@ +/* to be included in the main bpf.c file */ +#include "scx_ravg.bpf.h" + +#define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) +//#define RAVG_FN_ATTRS __attribute__((unused)) + +static RAVG_FN_ATTRS void ravg_add(__u64 *sum, __u64 addend) +{ + __u64 new = *sum + addend; + + if (new >= *sum) + *sum = new; + else + *sum = -1; +} + +static RAVG_FN_ATTRS __u64 ravg_decay(__u64 v, __u32 shift) +{ + if (shift >= 64) + return 0; + else + return v >> shift; +} + +static RAVG_FN_ATTRS __u32 ravg_normalize_dur(__u32 dur, __u32 half_life) +{ + if (dur < half_life) + return (((__u64)dur << RAVG_FRAC_BITS) + half_life - 1) / + half_life; + else + return 1 << RAVG_FRAC_BITS; +} + +/* + * Pre-computed decayed full-period values. This is quicker and keeps the bpf + * verifier happy by removing the need for looping. + * + * [0] = ravg_decay(1 << RAVG_FRAC_BITS, 1) + * [1] = [0] + ravg_decay(1 << RAVG_FRAC_BITS, 2) + * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) + * ... + */ +static __u64 ravg_full_sum[] = { + 524288, 786432, 917504, 983040, + 1015808, 1032192, 1040384, 1044480, + 1046528, 1047552, 1048064, 1048320, + 1048448, 1048512, 1048544, 1048560, + 1048568, 1048572, 1048574, 1048575, + /* the same from here on */ +}; + +static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_sum[0]); + +/** + * ravg_accumulate - Accumulate a new value + * @rd: ravg_data to accumulate into + * @new_val: new value + * @now: current timestamp + * @half_life: decay period, must be the same across calls + * + * The current value is changing to @val at @now. Accumulate accordingly. + */ +static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, + __u64 new_val, __u64 now, + __u32 half_life) +{ + __u32 cur_seq, val_seq, seq_delta; + + /* + * It may be difficult for the caller to guarantee monotonic progress if + * multiple CPUs accumulate to the same ravg_data. Handle @now being in + * the past of @rd->val_at. + */ + if (now < rd->val_at) + now = rd->val_at; + + cur_seq = now / half_life; + val_seq = rd->val_at / half_life; + seq_delta = cur_seq - val_seq; + + /* + * Decay ->old and fold ->cur into it. 
+ * + * @end + * v + * timeline |---------|---------|---------|---------|---------| + * seq delta 4 3 2 1 0 + * seq ->seq cur_seq + * val ->old ->cur ^ + * | | | + * \---------+------------------/ + */ + if (seq_delta > 0) { + /* decay ->old to bring it upto the cur_seq - 1 */ + rd->old = ravg_decay(rd->old, seq_delta); + /* non-zero ->cur must be from val_seq, calc and fold */ + ravg_add(&rd->old, ravg_decay(rd->cur, seq_delta)); + /* clear */ + rd->cur = 0; + } + + if (!rd->val) + goto out; + + /* + * Accumulate @rd->val between @rd->val_at and @now. + * + * @rd->val_at @now + * v v + * timeline |---------|---------|---------|---------|---------| + * seq delta [ 3 | 2 | 1 | 0 ] + */ + if (seq_delta > 0) { + __u32 dur; + + /* fold the oldest period which may be partial */ + dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); + ravg_add(&rd->old, rd->val * ravg_decay(dur, seq_delta)); + + /* fold the full periods in the middle with precomputed vals */ + if (seq_delta > 1) { + __u32 idx = seq_delta - 2; + + if (idx < ravg_full_sum_len) + ravg_add(&rd->old, rd->val * + ravg_full_sum[idx]); + else + ravg_add(&rd->old, rd->val * + ravg_full_sum[ravg_full_sum_len - 2]); + } + + /* accumulate the current period duration into ->runtime */ + rd->cur += rd->val * ravg_normalize_dur(now % half_life, + half_life); + } else { + rd->cur += rd->val * ravg_normalize_dur(now - rd->val_at, + half_life); + } +out: + if (new_val >= 1LLU << RAVG_VAL_BITS) + rd->val = (1LLU << RAVG_VAL_BITS) - 1; + else + rd->val = new_val; + rd->val_at = now; +} + +/** + * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) + * @a: multiplicand + * @b: multiplier + * @rshift: number of bits to shift right + * + * Poor man's 128bit arithmetic. Calculate ((@a * @b) >> @rshift) where @a is + * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must + * ensure that the final shifted result fits in u64. + */ +static __u64 u64_x_u32_rshift(__u64 a, __u32 b, __u32 rshift) +{ + const __u64 mask32 = (__u32)-1; + __u64 al = a & mask32; + __u64 ah = (a & (mask32 << 32)) >> 32; + + /* + * ah: high 32 al: low 32 + * a |--------------||--------------| + * + * ah * b |--------------||--------------| + * al * b |--------------||--------------| + */ + al *= b; + ah *= b; + + /* + * (ah * b) >> rshift |--------------||--------------| + * (al * b) >> rshift |--------------||--------| + * <--------> + * 32 - rshift + */ + al >>= rshift; + if (rshift <= 32) + ah <<= 32 - rshift; + else + ah >>= rshift - 32; + + return al + ah; +} + +/** + * ravg_read - Read the current running avg + * @rd: ravg_data to read from + * @now: timestamp as of which to read the running avg + * @half_life: decay period, must match ravg_accumulate()'s + * + * Read running avg from @rd as of @now. + */ +static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, + __u64 half_life) +{ + struct ravg_data trd; + __u32 elapsed = now % half_life; + + /* + * Accumulate the ongoing period into a temporary copy. This allows + * external readers to access up-to-date avg without strongly + * synchronizing with the updater (we need to add a seq lock tho). + */ + trd = *rd; + rd = &trd; + ravg_accumulate(rd, 0, now, half_life); + + /* + * At the beginning of a new half_life period, the running avg is the + * same as @rd->old. At the beginning of the next, it'd be old load / 2 + * + current load / 2. Inbetween, we blend the two linearly. 
+ */ + if (elapsed) { + __u32 progress = ravg_normalize_dur(elapsed, half_life); + /* + * `H` is the duration of the half-life window, and `E` is how + * much time has elapsed in this window. `P` is [0.0, 1.0] + * representing how much the current window has progressed: + * + * P = E / H + * + * If `old` is @rd->old, we would want to calculate the + * following for blending: + * + * old * (1.0 - P / 2) + * + * Because @progress is [0, 1 << RAVG_FRAC_BITS], let's multiply + * and then divide by 1 << RAVG_FRAC_BITS: + * + * (1 << RAVG_FRAC_BITS) - (1 << RAVG_FRAC_BITS) * P / 2 + * old * ----------------------------------------------------- + * 1 << RAVG_FRAC_BITS + * + * As @progress is (1 << RAVG_FRAC_BITS) * P: + * + * (1 << RAVG_FRAC_BITS) - progress / 2 + * old * ------------------------------------ + * 1 << RAVG_FRAC_BITS + * + * As @rd->old uses full 64bit, the multiplication can overflow, + * but we also know that the final result is gonna be smaller + * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle + * the interim multiplication correctly. + */ + __u64 old = u64_x_u32_rshift(rd->old, + (1 << RAVG_FRAC_BITS) - progress / 2, + RAVG_FRAC_BITS); + /* + * If `S` is the Sum(val * duration) for this half-life window, + * the avg for this window is: + * + * S / E + * + * We would want to calculate the following for blending: + * + * S / E * (P / 2) + * + * As P = E / H, + * + * S / E * (E / H / 2) + * S / H / 2 + * + * Expanding S, the above becomes: + * + * Sum(val * duration) / H / 2 + * Sum(val * (duration / H)) / 2 + * + * As we use RAVG_FRAC_BITS bits for fixed point arithmetic, + * let's multiply the whole result accordingly: + * + * (Sum(val * (duration / H)) / 2) * (1 << RAVG_FRAC_BITS) + * + * duration * (1 << RAVG_FRAC_BITS) + * Sum(val * --------------------------------) / 2 + * H + * + * The righthand multiplier inside Sum() is the normalized + * duration returned from ravg_normalize_dur(), so, the whole + * Sum term equals @rd->cur. + * + * rd->cur / 2 + */ + __u64 cur = rd->cur / 2; + + return old + cur; + } else { + return rd->old; + } +} diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 5d3af55691913..9cb5e8fd46200 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -36,6 +36,7 @@ * load balance based on userspace populating the lb_data map. 
*/ #include "../../../scx_common.bpf.h" +#include "../../../scx_ravg_impl.bpf.h" #include "rusty.h" #include @@ -97,6 +98,41 @@ struct { __uint(map_flags, 0); } dom_ctx SEC(".maps"); +struct dom_load { + struct bpf_spin_lock lock; + u64 load; + struct ravg_data ravg_data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct dom_load); + __uint(max_entries, MAX_DOMS); + __uint(map_flags, 0); +} dom_load SEC(".maps"); + +const u64 ravg_1 = 1 << RAVG_FRAC_BITS; + +static void adj_dom_load(u32 dom_id, s64 adj, u64 now) +{ + struct dom_load *load; + + if (!(load = bpf_map_lookup_elem(&dom_load, &dom_id))) { + scx_bpf_error("no dom_load for dom %u", dom_id); + return; + } + + bpf_spin_lock(&load->lock); + load->load += adj; + ravg_accumulate(&load->ravg_data, load->load, now, USAGE_HALF_LIFE); + bpf_spin_unlock(&load->lock); + + if (adj < 0 && (s64)load->load < 0) + scx_bpf_error("cpu%d dom%u load underflow (load=%lld adj=%lld)", + bpf_get_smp_processor_id(), dom_id, load->load, adj); +} + /* * Statistics */ @@ -225,12 +261,12 @@ static void refresh_tune_params(void) } } -static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, +static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, u32 new_dom_id, bool init_dsq_vtime) { struct dom_ctx *old_domc, *new_domc; struct bpf_cpumask *d_cpumask, *t_cpumask; - u32 old_dom_id = task_ctx->dom_id; + u32 old_dom_id = taskc->dom_id; s64 vtime_delta; old_domc = bpf_map_lookup_elem(&dom_ctx, &old_dom_id); @@ -257,7 +293,7 @@ static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, return false; } - t_cpumask = task_ctx->cpumask; + t_cpumask = taskc->cpumask; if (!t_cpumask) { scx_bpf_error("Failed to look up task cpumask"); return false; @@ -269,13 +305,21 @@ static bool task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, */ if (bpf_cpumask_intersects((const struct cpumask *)d_cpumask, p->cpus_ptr)) { + u64 now = bpf_ktime_get_ns(); + + if (taskc->runnable) + adj_dom_load(taskc->dom_id, -(s64)p->scx.weight, now); + p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta; - task_ctx->dom_id = new_dom_id; + taskc->dom_id = new_dom_id; bpf_cpumask_and(t_cpumask, (const struct cpumask *)d_cpumask, p->cpus_ptr); + + if (taskc->runnable) + adj_dom_load(taskc->dom_id, p->scx.weight, now); } - return task_ctx->dom_id == new_dom_id; + return taskc->dom_id == new_dom_id; } s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, @@ -657,13 +701,17 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev) void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) { + u64 now = bpf_ktime_get_ns(); struct task_ctx *taskc; if (!(taskc = lookup_task_ctx(p))) return; + taskc->runnable = true; taskc->runnable_at = bpf_ktime_get_ns(); taskc->is_kworker = p->flags & PF_WQ_WORKER; + + adj_dom_load(taskc->dom_id, p->scx.weight, now); } void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) @@ -714,13 +762,17 @@ void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) { + u64 now = bpf_ktime_get_ns(); struct task_ctx *taskc; if (!(taskc = lookup_task_ctx(p))) return; - taskc->runnable_for += bpf_ktime_get_ns() - taskc->runnable_at; + taskc->runnable = false; + taskc->runnable_for += now - taskc->runnable_at; taskc->runnable_at = 0; + + adj_dom_load(taskc->dom_id, -(s64)p->scx.weight, now); } void BPF_STRUCT_OPS(rusty_set_weight, 
struct task_struct *p, u32 weight) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index 5a48c78fe9174..d2efb06400d84 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -17,9 +17,12 @@ typedef unsigned char u8; typedef unsigned int u32; typedef unsigned long long u64; -#define MAX_CPUS 512 -#define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ -#define CACHELINE_SIZE 64 +#include "../../../scx_ravg.bpf.h" + +#define MAX_CPUS 512 +#define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ +#define CACHELINE_SIZE 64 +#define USAGE_HALF_LIFE 1000000000 /* 1s */ /* Statistics */ enum stat_idx { @@ -52,6 +55,7 @@ struct task_ctx { struct bpf_cpumask __kptr *cpumask; u32 dom_id; u32 weight; + bool runnable; u64 runnable_at; u64 running_at; u64 runnable_for; From ed47cb27586d2d339403563d3aa3a999a7d30927 Mon Sep 17 00:00:00 2001 From: Yuran Pereira Date: Sat, 28 Oct 2023 10:54:13 +0530 Subject: [PATCH 129/304] selftests/bpf: Convert CHECK macros to ASSERT_* macros in bpf_iter As it was pointed out by Yonghong Song [1], in the bpf selftests the use of the ASSERT_* series of macros is preferred over the CHECK macro. This patch replaces all CHECK calls in bpf_iter with the appropriate ASSERT_* macros. [1] https://lore.kernel.org/lkml/0a142924-633c-44e6-9a92-2dc019656bf2@linux.dev Suggested-by: Yonghong Song Signed-off-by: Yuran Pereira Acked-by: Yonghong Song Acked-by: Kui-Feng Lee Link: https://lore.kernel.org/r/DB3PR10MB6835E9C8DFCA226DD6FEF914E8A3A@DB3PR10MB6835.EURPRD10.PROD.OUTLOOK.COM Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_iter.c | 79 ++++++++----------- 1 file changed, 35 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index e3498f607b49d..5e334d3d7ac23 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -34,8 +34,6 @@ #include "bpf_iter_ksym.skel.h" #include "bpf_iter_sockmap.skel.h" -static int duration; - static void test_btf_id_or_null(void) { struct bpf_iter_test_kern3 *skel; @@ -64,7 +62,7 @@ static void do_dummy_read_opts(struct bpf_program *prog, struct bpf_iter_attach_ /* not check contents, but ensure read() ends without error */ while ((len = read(iter_fd, buf, sizeof(buf))) > 0) ; - CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)); + ASSERT_GE(len, 0, "read"); close(iter_fd); @@ -413,7 +411,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel) goto free_link; } - if (CHECK(err < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(err, 0, "read")) goto free_link; ASSERT_HAS_SUBSTR(taskbuf, "(struct task_struct)", @@ -526,11 +524,11 @@ static int do_read_with_fd(int iter_fd, const char *expected, start = 0; while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) { start += len; - if (CHECK(start >= 16, "read", "read len %d\n", len)) + if (!ASSERT_LT(start, 16, "read")) return -1; read_buf_len = read_one_char ? 
1 : 16 - start; } - if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) return -1; if (!ASSERT_STREQ(buf, expected, "read")) @@ -571,8 +569,7 @@ static int do_read(const char *path, const char *expected) int err, iter_fd; iter_fd = open(path, O_RDONLY); - if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n", - path, strerror(errno))) + if (!ASSERT_GE(iter_fd, 0, "open")) return -1; err = do_read_with_fd(iter_fd, expected, false); @@ -600,7 +597,7 @@ static void test_file_iter(void) unlink(path); err = bpf_link__pin(link, path); - if (CHECK(err, "pin_iter", "pin_iter to %s failed: %d\n", path, err)) + if (!ASSERT_OK(err, "pin_iter")) goto free_link; err = do_read(path, "abcd"); @@ -651,12 +648,10 @@ static void test_overflow(bool test_e2big_overflow, bool ret1) * overflow and needs restart. */ map1_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL); - if (CHECK(map1_fd < 0, "bpf_map_create", - "map_creation failed: %s\n", strerror(errno))) + if (!ASSERT_GE(map1_fd, 0, "bpf_map_create")) goto out; map2_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL); - if (CHECK(map2_fd < 0, "bpf_map_create", - "map_creation failed: %s\n", strerror(errno))) + if (!ASSERT_GE(map2_fd, 0, "bpf_map_create")) goto free_map1; /* bpf_seq_printf kernel buffer is 8 pages, so one map @@ -685,14 +680,12 @@ static void test_overflow(bool test_e2big_overflow, bool ret1) /* setup filtering map_id in bpf program */ map_info_len = sizeof(map_info); err = bpf_map_get_info_by_fd(map1_fd, &map_info, &map_info_len); - if (CHECK(err, "get_map_info", "get map info failed: %s\n", - strerror(errno))) + if (!ASSERT_OK(err, "get_map_info")) goto free_map2; skel->bss->map1_id = map_info.id; err = bpf_map_get_info_by_fd(map2_fd, &map_info, &map_info_len); - if (CHECK(err, "get_map_info", "get map info failed: %s\n", - strerror(errno))) + if (!ASSERT_OK(err, "get_map_info")) goto free_map2; skel->bss->map2_id = map_info.id; @@ -714,16 +707,14 @@ static void test_overflow(bool test_e2big_overflow, bool ret1) while ((len = read(iter_fd, buf, expected_read_len)) > 0) total_read_len += len; - CHECK(len != -1 || errno != E2BIG, "read", - "expected ret -1, errno E2BIG, but get ret %d, error %s\n", - len, strerror(errno)); + ASSERT_EQ(len, -1, "read"); + ASSERT_EQ(errno, E2BIG, "read"); goto free_buf; } else if (!ret1) { while ((len = read(iter_fd, buf, expected_read_len)) > 0) total_read_len += len; - if (CHECK(len < 0, "read", "read failed: %s\n", - strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto free_buf; } else { do { @@ -732,8 +723,7 @@ static void test_overflow(bool test_e2big_overflow, bool ret1) total_read_len += len; } while (len > 0 || len == -EAGAIN); - if (CHECK(len < 0, "read", "read failed: %s\n", - strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto free_buf; } @@ -836,7 +826,7 @@ static void test_bpf_hash_map(void) /* do some tests */ while ((len = read(iter_fd, buf, sizeof(buf))) > 0) ; - if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto close_iter; /* test results */ @@ -917,7 +907,7 @@ static void test_bpf_percpu_hash_map(void) /* do some tests */ while ((len = read(iter_fd, buf, sizeof(buf))) > 0) ; - if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto close_iter; /* test results */ @@ -983,17 +973,14 @@ static void test_bpf_array_map(void) start = 0; while ((len = read(iter_fd, buf + start, sizeof(buf) - start)) > 0) start += len; - 
if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto close_iter; /* test results */ res_first_key = *(__u32 *)buf; res_first_val = *(__u64 *)(buf + sizeof(__u32)); - if (CHECK(res_first_key != 0 || res_first_val != first_val, - "bpf_seq_write", - "seq_write failure: first key %u vs expected 0, " - " first value %llu vs expected %llu\n", - res_first_key, res_first_val, first_val)) + if (!ASSERT_EQ(res_first_key, 0, "bpf_seq_write") || + !ASSERT_EQ(res_first_val, first_val, "bpf_seq_write")) goto close_iter; if (!ASSERT_EQ(skel->bss->key_sum, expected_key, "key_sum")) @@ -1092,7 +1079,7 @@ static void test_bpf_percpu_array_map(void) /* do some tests */ while ((len = read(iter_fd, buf, sizeof(buf))) > 0) ; - if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto close_iter; /* test results */ @@ -1131,6 +1118,7 @@ static void test_bpf_sk_storage_delete(void) sock_fd = socket(AF_INET6, SOCK_STREAM, 0); if (!ASSERT_GE(sock_fd, 0, "socket")) goto out; + err = bpf_map_update_elem(map_fd, &sock_fd, &val, BPF_NOEXIST); if (!ASSERT_OK(err, "map_update")) goto out; @@ -1151,14 +1139,19 @@ static void test_bpf_sk_storage_delete(void) /* do some tests */ while ((len = read(iter_fd, buf, sizeof(buf))) > 0) ; - if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto close_iter; /* test results */ err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); - if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem", - "map value wasn't deleted (err=%d, errno=%d)\n", err, errno)) - goto close_iter; + + /* Note: The following assertions serve to ensure + * the value was deleted. It does so by asserting + * that bpf_map_lookup_elem has failed. This might + * seem counterintuitive at first. + */ + ASSERT_ERR(err, "bpf_map_lookup_elem"); + ASSERT_EQ(errno, ENOENT, "bpf_map_lookup_elem"); close_iter: close(iter_fd); @@ -1203,17 +1196,15 @@ static void test_bpf_sk_storage_get(void) do_dummy_read(skel->progs.fill_socket_owner); err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); - if (CHECK(err || val != getpid(), "bpf_map_lookup_elem", - "map value wasn't set correctly (expected %d, got %d, err=%d)\n", - getpid(), val, err)) + if (!ASSERT_OK(err, "bpf_map_lookup_elem") || + !ASSERT_EQ(val, getpid(), "bpf_map_lookup_elem")) goto close_socket; do_dummy_read(skel->progs.negate_socket_local_storage); err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); - CHECK(err || val != -getpid(), "bpf_map_lookup_elem", - "map value wasn't set correctly (expected %d, got %d, err=%d)\n", - -getpid(), val, err); + ASSERT_OK(err, "bpf_map_lookup_elem"); + ASSERT_EQ(val, -getpid(), "bpf_map_lookup_elem"); close_socket: close(sock_fd); @@ -1290,7 +1281,7 @@ static void test_bpf_sk_storage_map(void) /* do some tests */ while ((len = read(iter_fd, buf, sizeof(buf))) > 0) ; - if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (!ASSERT_GE(len, 0, "read")) goto close_iter; /* test results */ From cb3c6a58be50c65014296aa3455cae0fa1e82eac Mon Sep 17 00:00:00 2001 From: Yuran Pereira Date: Sat, 28 Oct 2023 10:54:14 +0530 Subject: [PATCH 130/304] selftests/bpf: Add malloc failure checks in bpf_iter Since some malloc calls in bpf_iter may at times fail, this patch adds the appropriate fail checks, and ensures that any previously allocated resource is appropriately destroyed before returning the function. 
Signed-off-by: Yuran Pereira Acked-by: Yonghong Song Acked-by: Kui-Feng Lee Link: https://lore.kernel.org/r/DB3PR10MB6835F0ECA792265FA41FC39BE8A3A@DB3PR10MB6835.EURPRD10.PROD.OUTLOOK.COM Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 5e334d3d7ac23..4e02093c2cbef 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -698,7 +698,7 @@ static void test_overflow(bool test_e2big_overflow, bool ret1) goto free_link; buf = malloc(expected_read_len); - if (!buf) + if (!ASSERT_OK_PTR(buf, "malloc")) goto close_iter; /* do read */ @@ -868,6 +868,8 @@ static void test_bpf_percpu_hash_map(void) skel->rodata->num_cpus = bpf_num_possible_cpus(); val = malloc(8 * bpf_num_possible_cpus()); + if (!ASSERT_OK_PTR(val, "malloc")) + goto out; err = bpf_iter_bpf_percpu_hash_map__load(skel); if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_percpu_hash_map__load")) @@ -1044,6 +1046,8 @@ static void test_bpf_percpu_array_map(void) skel->rodata->num_cpus = bpf_num_possible_cpus(); val = malloc(8 * bpf_num_possible_cpus()); + if (!ASSERT_OK_PTR(val, "malloc")) + goto out; err = bpf_iter_bpf_percpu_array_map__load(skel); if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_percpu_array_map__load")) From 3cda0779ded173572e7a23a5da174cdc69b5978b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:43 -0700 Subject: [PATCH 131/304] selftests/bpf: fix RELEASE=1 build for tc_opts Compiler complains about malloc(). We also don't need to dynamically allocate anything, so make the life easier by using statically sized buffer. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tc_opts.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_opts.c b/tools/testing/selftests/bpf/prog_tests/tc_opts.c index 51883ccb80206..196abf2234656 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_opts.c @@ -2387,12 +2387,9 @@ static int generate_dummy_prog(void) const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); LIBBPF_OPTS(bpf_prog_load_opts, opts); const size_t log_buf_sz = 256; - char *log_buf; + char log_buf[log_buf_sz]; int fd = -1; - log_buf = malloc(log_buf_sz); - if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc")) - return fd; opts.log_buf = log_buf; opts.log_size = log_buf_sz; @@ -2402,7 +2399,6 @@ static int generate_dummy_prog(void) prog_insns, prog_insn_cnt, &opts); ASSERT_STREQ(log_buf, "", "log_0"); ASSERT_GE(fd, 0, "prog_fd"); - free(log_buf); return fd; } From 7bcc07dcd835833e9ca38e4121653b6b6d4bb4a8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:44 -0700 Subject: [PATCH 132/304] selftests/bpf: satisfy compiler by having explicit return in btf test Some compilers complain about get_pprint_mapv_size() not returning value in some code paths. Fix with explicit return. 
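
With the change, the tail of get_pprint_mapv_size() ends up looking roughly like this; the added return is never meant to be reached, it only exists to satisfy the compiler's missing-return diagnostic (typically -Wreturn-type):

	/* every supported mapv kind is handled in the #if branches above */
	assert(0);
	return 0;
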
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 92d51f377fe59..8fb4a04fbbc04 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -5265,6 +5265,7 @@ static size_t get_pprint_mapv_size(enum pprint_mapv_kind_t mapv_kind) #endif assert(0); + return 0; } static void set_pprint_mapv(enum pprint_mapv_kind_t mapv_kind, From 2e74aef782d38f73b65c56c8ff5973488daa91c4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:45 -0700 Subject: [PATCH 133/304] bpf: derive smin/smax from umin/max bounds Add smin/smax derivation from appropriate umin/umax values. Previously the logic was surprisingly asymmetric, trying to derive umin/umax from smin/smax (if possible), but not trying to do the same in the other direction. A simple addition to __reg64_deduce_bounds() fixes this. Added also generic comment about u64/s64 ranges and their relationship. Hopefully that helps readers to understand all the bounds deductions a bit better. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 857d766945171..8a4cdd2787ecc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2358,6 +2358,77 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) static void __reg64_deduce_bounds(struct bpf_reg_state *reg) { + /* If u64 range forms a valid s64 range (due to matching sign bit), + * try to learn from that. Let's do a bit of ASCII art to see when + * this is happening. Let's take u64 range first: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * + * Valid u64 range is formed when umin and umax are anywhere in the + * range [0, U64_MAX], and umin <= umax. u64 case is simple and + * straightforward. Let's see how s64 range maps onto the same range + * of values, annotated below the line for comparison: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * + * So s64 values basically start in the middle and they are logically + * contiguous to the right of it, wrapping around from -1 to 0, and + * then finishing as S64_MAX (0x7fffffffffffffff) right before + * S64_MIN. We can try drawing the continuity of u64 vs s64 values + * more visually as mapped to sign-agnostic range of hex values. + * + * u64 start u64 end + * _______________________________________________________________ + * / \ + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * / \ + * >------------------------------ -------------------------------> + * s64 continues... s64 end s64 start s64 "midpoint" + * + * What this means is that, in general, we can't always derive + * something new about u64 from any random s64 range, and vice versa. + * + * But we can do that in two particular cases. 
One is when entire + * u64/s64 range is *entirely* contained within left half of the above + * diagram or when it is *entirely* contained in the right half. I.e.: + * + * |-------------------------------|--------------------------------| + * ^ ^ ^ ^ + * A B C D + * + * [A, B] and [C, D] are contained entirely in their respective halves + * and form valid contiguous ranges as both u64 and s64 values. [A, B] + * will be non-negative both as u64 and s64 (and in fact it will be + * identical ranges no matter the signedness). [C, D] treated as s64 + * will be a range of negative values, while in u64 it will be + * non-negative range of values larger than 0x8000000000000000. + * + * Now, any other range here can't be represented in both u64 and s64 + * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid + * contiguous u64 ranges, but they are discontinuous in s64. [B, C] + * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], + * for example. Similarly, valid s64 range [D, A] (going from negative + * to positive values), would be two separate [D, U64_MAX] and [0, A] + * ranges as u64. Currently reg_state can't represent two segments per + * numeric domain, so in such situations we can only derive maximal + * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). + * + * So we use these facts to derive umin/umax from smin/smax and vice + * versa only if they stay within the same "half". This is equivalent + * to checking sign bit: lower half will have sign bit as zero, upper + * half have sign bit 1. Below in code we simplify this by just + * casting umin/umax as smin/smax and checking if they form valid + * range, and vice versa. Those are equivalent checks. + */ + if ((s64)reg->umin_value <= (s64)reg->umax_value) { + reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); + reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); + } /* Learn sign from signed bounds. * If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. From f188765f23a58bebce48bd078effa95733fe37de Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:46 -0700 Subject: [PATCH 134/304] bpf: derive smin32/smax32 from umin32/umax32 bounds All the logic that applies to u64 vs s64, equally applies for u32 vs s32 relationships (just taken in a smaller 32-bit numeric space). So do the same deduction of smin32/smax32 from umin32/umax32, if we can. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a4cdd2787ecc..b93818abe7fc9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2324,6 +2324,13 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) /* Uses signed min/max values to inform unsigned, and vice-versa */ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) { + /* if u32 range forms a valid s32 range (due to matching sign bit), + * try to learn from that + */ + if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); + } /* Learn sign from signed bounds. 
* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. From f404ef3b42c8e6719f0039d83668c837e5daca57 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:47 -0700 Subject: [PATCH 135/304] bpf: derive subreg bounds from full bounds when upper 32 bits are constant Comments in code try to explain the idea behind why this is correct. Please check the code and comments. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b93818abe7fc9..e48a6180627bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2324,6 +2324,51 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) /* Uses signed min/max values to inform unsigned, and vice-versa */ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) { + /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 + * bits to improve our u32/s32 boundaries. + * + * E.g., the case where we have upper 32 bits as zero ([10, 20] in + * u64) is pretty trivial, it's obvious that in u32 we'll also have + * [10, 20] range. But this property holds for any 64-bit range as + * long as upper 32 bits in that entire range of values stay the same. + * + * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] + * in decimal) has the same upper 32 bits throughout all the values in + * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) + * range. + * + * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, + * following the rules outlined below about u64/s64 correspondence + * (which equally applies to u32 vs s32 correspondence). In general it + * depends on actual hexadecimal values of 32-bit range. They can form + * only valid u32, or only valid s32 ranges in some cases. + * + * So we use all these insights to derive bounds for subregisters here. 
+ */ + if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { + /* u64 to u32 casting preserves validity of low 32 bits as + * a range, if upper 32 bits are the same + */ + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); + + if ((s32)reg->umin_value <= (s32)reg->umax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + } + } + if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { + /* low 32 bits should form a proper u32 range */ + if ((u32)reg->smin_value <= (u32)reg->smax_value) { + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); + } + /* low 32 bits should form a proper s32 range */ + if ((s32)reg->smin_value <= (s32)reg->smax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + } + } /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ From 6533e0acff58b9f141c0c7dc93114535ac5a3985 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:48 -0700 Subject: [PATCH 136/304] bpf: add special smin32/smax32 derivation from 64-bit bounds Add a special case where we can derive valid s32 bounds from umin/umax or smin/smax by stitching together negative s32 subrange and non-negative s32 subrange. That requires upper 32 bits to form a [N, N+1] range in u32 domain (taking into account wrap around, so 0xffffffff to 0x00000000 is a valid [N, N+1] range in this sense). See code comment for concrete examples. Eduard Zingerman also provided an alternative explanation ([0]) for more mathematically inclined readers: Suppose: . there are numbers a, b, c . 2**31 <= b < 2**32 . 0 <= c < 2**31 . umin = 2**32 * a + b . umax = 2**32 * (a + 1) + c The number of values in the range represented by [umin; umax] is: . N = umax - umin + 1 = 2**32 + c - b + 1 . min(N) = 2**32 + 0 - (2**32-1) + 1 = 2, with b = 2**32-1, c = 0 . max(N) = 2**32 + (2**31 - 1) - 2**31 + 1 = 2**32, with b = 2**31, c = 2**31-1 Hence [(s32)b; (s32)c] forms a valid range. [0] https://lore.kernel.org/bpf/d7af631802f0cfae20df77fe70068702d24bbd31.camel@gmail.com/ Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e48a6180627bf..08888784cbc8d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2369,6 +2369,29 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); } } + /* Special case where upper bits form a small sequence of two + * sequential numbers (in 32-bit unsigned space, so 0xffffffff to + * 0x00000000 is also valid), while lower bits form a proper s32 range + * going from negative numbers to positive numbers. E.g., let's say we + * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). + * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, + * 0x0000000000000000, 0x00000000000001}). 
Ignoring upper 32 bits, + * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). + * Note that it doesn't have to be 0xffffffff going to 0x00000000 in + * upper 32 bits. As a random example, s64 range + * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range + * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. + */ + if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && + (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + } + if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && + (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + } /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ From 3d6940ddd9b56b3fc376ee39656f6fb1b4e1a981 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:49 -0700 Subject: [PATCH 137/304] bpf: improve deduction of 64-bit bounds from 32-bit bounds Add a few interesting cases in which we can tighten 64-bit bounds based on newly learnt information about 32-bit bounds. E.g., when full u64/s64 registers are used in BPF program, and then eventually compared as u32/s32. The latter comparison doesn't change the value of full register, but it does impose new restrictions on possible lower 32 bits of such full registers. And we can use that to derive additional full register bounds information. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102033759.2541186-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 08888784cbc8d..d0d0a1a1b662b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2536,10 +2536,54 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) } } +static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) +{ + /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit + * values on both sides of 64-bit range in hope to have tigher range. + * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from + * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. + * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound + * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of + * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a + * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. + * We just need to make sure that derived bounds we are intersecting + * with are well-formed ranges in respecitve s64 or u64 domain, just + * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. 
+ */ + __u64 new_umin, new_umax; + __s64 new_smin, new_smax; + + /* u32 -> u64 tightening, it's always well-formed */ + new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; + new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; + reg->umin_value = max_t(u64, reg->umin_value, new_umin); + reg->umax_value = min_t(u64, reg->umax_value, new_umax); + /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ + new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; + new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; + reg->smin_value = max_t(s64, reg->smin_value, new_smin); + reg->smax_value = min_t(s64, reg->smax_value, new_smax); + + /* if s32 can be treated as valid u32 range, we can use it as well */ + if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { + /* s32 -> u64 tightening */ + new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; + new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; + reg->umin_value = max_t(u64, reg->umin_value, new_umin); + reg->umax_value = min_t(u64, reg->umax_value, new_umax); + /* s32 -> s64 tightening */ + new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; + new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; + reg->smin_value = max_t(s64, reg->smin_value, new_smin); + reg->smax_value = min_t(s64, reg->smax_value, new_smax); + } +} + static void __reg_deduce_bounds(struct bpf_reg_state *reg) { __reg32_deduce_bounds(reg); __reg64_deduce_bounds(reg); + __reg_deduce_mixed_bounds(reg); } /* Attempts to improve var_off based on unsigned min/max information */ From 558c06e551a3e2fd166e80aefb6bbd51c83737d1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:50 -0700 Subject: [PATCH 138/304] bpf: try harder to deduce register bounds from different numeric domains There are cases (caught by subsequent reg_bounds tests in selftests/bpf) where performing one round of __reg_deduce_bounds() doesn't propagate all the information from, say, s32 to u32 bounds and than from newly learned u32 bounds back to u64 and s64. So perform __reg_deduce_bounds() twice to make sure such derivations are propagated fully after reg_bounds_sync(). One such example is test `(s64)[0xffffffff00000001; 0] (u64)< 0xffffffff00000000` from selftest patch from this patch set. It demonstrates an intricate dance of u64 -> s64 -> u64 -> u32 bounds adjustments, which requires two rounds of __reg_deduce_bounds(). Here are corresponding refinement log from selftest, showing evolution of knowledge. REFINING (FALSE R1) (u64)SRC=[0xffffffff00000000; U64_MAX] (u64)DST_OLD=[0; U64_MAX] (u64)DST_NEW=[0xffffffff00000000; U64_MAX] REFINING (FALSE R1) (u64)SRC=[0xffffffff00000000; U64_MAX] (s64)DST_OLD=[0xffffffff00000001; 0] (s64)DST_NEW=[0xffffffff00000001; -1] REFINING (FALSE R1) (s64)SRC=[0xffffffff00000001; -1] (u64)DST_OLD=[0xffffffff00000000; U64_MAX] (u64)DST_NEW=[0xffffffff00000001; U64_MAX] REFINING (FALSE R1) (u64)SRC=[0xffffffff00000001; U64_MAX] (u32)DST_OLD=[0; U32_MAX] (u32)DST_NEW=[1; U32_MAX] R1 initially has smin/smax set to [0xffffffff00000001; -1], while umin/umax is unknown. After (u64)< comparison, in FALSE branch we gain knowledge that umin/umax is [0xffffffff00000000; U64_MAX]. That causes smin/smax to learn that zero can't happen and upper bound is -1. Then smin/smax is adjusted from umin/umax improving lower bound from 0xffffffff00000000 to 0xffffffff00000001. 
And then eventually umin32/umax32 bounds are derived from umin/umax and become [1; U32_MAX]. The selftest in the last patch actually implements multi-round fixed-point convergence logic, but so far all the tests are handled by two rounds of reg_bounds_sync() on the verifier state, so we keep it simple for now. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d0d0a1a1b662b..2991e9dd44755 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2605,6 +2605,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); /* We might have learned something about the sign bit. */ __reg_deduce_bounds(reg); + __reg_deduce_bounds(reg); /* We might have learned some bits from the bounds. */ __reg_bound_offset(reg); /* Intersecting with the old var_off might have improved our bounds From b929d4979b2be8ab0e6f539bebadbb4e38dc5e90 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:51 -0700 Subject: [PATCH 139/304] bpf: drop knowledge-losing __reg_combine_{32,64}_into_{64,32} logic When a 32-bit conditional operation operates on the lower 32 bits of a full 64-bit register, the register's full value isn't changed. We just potentially gain new knowledge about that register's lower 32 bits. Unfortunately, the __reg_combine_{32,64}_into_{64,32} logic that reg_set_min_max() performs as a last step can lose information in some cases due to __mark_reg64_unbounded() and __reg_assign_32_into_64(). That's bad and completely unnecessary. Especially __reg_assign_32_into_64() looks completely out of place here, because we are not performing a zero-extending subregister assignment during a conditional jump. So this patch replaces __reg_combine_* with a normal reg_bounds_sync(), which does a proper job of deriving u64/s64 bounds from u32/s32, and vice versa (among all other combinations). __reg_combine_64_into_32() is also used in one more place, coerce_reg_to_size(), while handling 1- and 2-byte register loads. Looking into this, besides marking the subregister as unbounded before performing reg_bounds_sync(), we were also deducing smin32/smax32 and umin32/umax32 bounds from the respective smin/smax and umin/umax bounds. That is now redundant, as reg_bounds_sync() performs the same logic more generically (e.g., without the unnecessary assumption that the upper 32 bits of the full register should be zero). Long story short, we remove __reg_combine_64_into_32() completely, and coerce_reg_to_size() now only resets the subreg to unbounded and then performs reg_bounds_sync() to recover as much information as possible from the 64-bit umin/umax and smin/smax bounds, set explicitly in coerce_reg_to_size() earlier.
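To make the new coerce_reg_to_size() flow concrete, here is a rough user-space sketch of the "reset the subregister, then re-derive it" idea. This is a simplified model under assumed names (struct range, mark32_unbounded(), bounds_sync()), not the verifier code, and it models only one of the deductions a full reg_bounds_sync() performs.

/* Simplified user-space model of "reset subreg, then sync"; not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct range {
        uint64_t umin, umax;            /* 64-bit unsigned bounds */
        uint32_t u32_min, u32_max;      /* 32-bit subregister bounds */
};

static void mark32_unbounded(struct range *r)
{
        r->u32_min = 0;
        r->u32_max = UINT32_MAX;
}

static void bounds_sync(struct range *r)
{
        /* one deduction a full reg_bounds_sync() would perform: if the upper
         * 32 bits are constant across [umin, umax], the low 32 bits form a
         * valid u32 range of their own
         */
        if ((r->umin >> 32) == (r->umax >> 32)) {
                uint32_t lo = (uint32_t)r->umin, hi = (uint32_t)r->umax;

                if (lo > r->u32_min)
                        r->u32_min = lo;
                if (hi < r->u32_max)
                        r->u32_max = hi;
        }
}

int main(void)
{
        /* 64-bit bounds [0x1'00000005, 0x1'0000000a] set earlier */
        struct range r = { .umin = 0x100000005ULL, .umax = 0x10000000aULL };

        mark32_unbounded(&r);   /* drop stale subreg knowledge */
        bounds_sync(&r);        /* re-derive it from the 64-bit bounds */
        printf("u32 range: [%u, %u]\n",
               (unsigned)r.u32_min, (unsigned)r.u32_max); /* [5, 10] */
        return 0;
}

Compiled standalone, this prints "u32 range: [5, 10]": the stale 32-bit bounds are discarded and then recovered purely from the 64-bit bounds, which is all the truncation path needs.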
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102033759.2541186-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 60 ++++++------------------------------------- 1 file changed, 8 insertions(+), 52 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2991e9dd44755..8802172fe8c9e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2639,51 +2639,6 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } } -static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -{ - /* special case when 64-bit register has upper 32-bit register - * zeroed. Typically happens after zext or <<32, >>32 sequence - * allowing us to use 32-bit bounds directly, - */ - if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) { - __reg_assign_32_into_64(reg); - } else { - /* Otherwise the best we can do is push lower 32bit known and - * unknown bits into register (var_off set from jmp logic) - * then learn as much as possible from the 64-bit tnum - * known and unknown bits. The previous smin/smax bounds are - * invalid here because of jmp32 compare so mark them unknown - * so they do not impact tnum bounds calculation. - */ - __mark_reg64_unbounded(reg); - } - reg_bounds_sync(reg); -} - -static bool __reg64_bound_s32(s64 a) -{ - return a >= S32_MIN && a <= S32_MAX; -} - -static bool __reg64_bound_u32(u64 a) -{ - return a >= U32_MIN && a <= U32_MAX; -} - -static void __reg_combine_64_into_32(struct bpf_reg_state *reg) -{ - __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { - reg->s32_min_value = (s32)reg->smin_value; - reg->s32_max_value = (s32)reg->smax_value; - } - if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) { - reg->u32_min_value = (u32)reg->umin_value; - reg->u32_max_value = (u32)reg->umax_value; - } - reg_bounds_sync(reg); -} - /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -6380,9 +6335,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * values are also truncated so we push 64-bit bounds into * 32-bit bounds. Above were truncated < 32-bits already. */ - if (size >= 4) - return; - __reg_combine_64_into_32(reg); + if (size < 4) { + __mark_reg32_unbounded(reg); + reg_bounds_sync(reg); + } } static void set_sext64_default_val(struct bpf_reg_state *reg, int size) @@ -14621,13 +14577,13 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, tnum_subreg(false_32off)); true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), tnum_subreg(true_32off)); - __reg_combine_32_into_64(false_reg); - __reg_combine_32_into_64(true_reg); + reg_bounds_sync(false_reg); + reg_bounds_sync(true_reg); } else { false_reg->var_off = false_64off; true_reg->var_off = true_64off; - __reg_combine_64_into_32(false_reg); - __reg_combine_64_into_32(true_reg); + reg_bounds_sync(false_reg); + reg_bounds_sync(true_reg); } } From cdeb5dab9238520c39c4701d551f5d66463c504b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:53 -0700 Subject: [PATCH 140/304] bpf: rename is_branch_taken reg arguments to prepare for the second one Just taking mundane refactoring bits out into a separate patch. No functional changes. 
Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102033759.2541186-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 108 +++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8802172fe8c9e..725f327ce5eb8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14167,26 +14167,26 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } -static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) +static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode) { - struct tnum subreg = tnum_subreg(reg->var_off); + struct tnum subreg = tnum_subreg(reg1->var_off); s32 sval = (s32)val; switch (opcode) { case BPF_JEQ: if (tnum_is_const(subreg)) return !!tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) + else if (val < reg1->u32_min_value || val > reg1->u32_max_value) return 0; - else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) return 0; break; case BPF_JNE: if (tnum_is_const(subreg)) return !tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) + else if (val < reg1->u32_min_value || val > reg1->u32_max_value) return 1; - else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) return 1; break; case BPF_JSET: @@ -14196,51 +14196,51 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) return 0; break; case BPF_JGT: - if (reg->u32_min_value > val) + if (reg1->u32_min_value > val) return 1; - else if (reg->u32_max_value <= val) + else if (reg1->u32_max_value <= val) return 0; break; case BPF_JSGT: - if (reg->s32_min_value > sval) + if (reg1->s32_min_value > sval) return 1; - else if (reg->s32_max_value <= sval) + else if (reg1->s32_max_value <= sval) return 0; break; case BPF_JLT: - if (reg->u32_max_value < val) + if (reg1->u32_max_value < val) return 1; - else if (reg->u32_min_value >= val) + else if (reg1->u32_min_value >= val) return 0; break; case BPF_JSLT: - if (reg->s32_max_value < sval) + if (reg1->s32_max_value < sval) return 1; - else if (reg->s32_min_value >= sval) + else if (reg1->s32_min_value >= sval) return 0; break; case BPF_JGE: - if (reg->u32_min_value >= val) + if (reg1->u32_min_value >= val) return 1; - else if (reg->u32_max_value < val) + else if (reg1->u32_max_value < val) return 0; break; case BPF_JSGE: - if (reg->s32_min_value >= sval) + if (reg1->s32_min_value >= sval) return 1; - else if (reg->s32_max_value < sval) + else if (reg1->s32_max_value < sval) return 0; break; case BPF_JLE: - if (reg->u32_max_value <= val) + if (reg1->u32_max_value <= val) return 1; - else if (reg->u32_min_value > val) + else if (reg1->u32_min_value > val) return 0; break; case BPF_JSLE: - if (reg->s32_max_value <= sval) + if (reg1->s32_max_value <= sval) return 1; - else if (reg->s32_min_value > sval) + else if (reg1->s32_min_value > sval) return 0; break; } @@ -14249,79 +14249,79 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) } -static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode) { s64 sval = (s64)val; switch (opcode) { case BPF_JEQ: - if 
(tnum_is_const(reg->var_off)) - return !!tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + if (tnum_is_const(reg1->var_off)) + return !!tnum_equals_const(reg1->var_off, val); + else if (val < reg1->umin_value || val > reg1->umax_value) return 0; - else if (sval < reg->smin_value || sval > reg->smax_value) + else if (sval < reg1->smin_value || sval > reg1->smax_value) return 0; break; case BPF_JNE: - if (tnum_is_const(reg->var_off)) - return !tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + if (tnum_is_const(reg1->var_off)) + return !tnum_equals_const(reg1->var_off, val); + else if (val < reg1->umin_value || val > reg1->umax_value) return 1; - else if (sval < reg->smin_value || sval > reg->smax_value) + else if (sval < reg1->smin_value || sval > reg1->smax_value) return 1; break; case BPF_JSET: - if ((~reg->var_off.mask & reg->var_off.value) & val) + if ((~reg1->var_off.mask & reg1->var_off.value) & val) return 1; - if (!((reg->var_off.mask | reg->var_off.value) & val)) + if (!((reg1->var_off.mask | reg1->var_off.value) & val)) return 0; break; case BPF_JGT: - if (reg->umin_value > val) + if (reg1->umin_value > val) return 1; - else if (reg->umax_value <= val) + else if (reg1->umax_value <= val) return 0; break; case BPF_JSGT: - if (reg->smin_value > sval) + if (reg1->smin_value > sval) return 1; - else if (reg->smax_value <= sval) + else if (reg1->smax_value <= sval) return 0; break; case BPF_JLT: - if (reg->umax_value < val) + if (reg1->umax_value < val) return 1; - else if (reg->umin_value >= val) + else if (reg1->umin_value >= val) return 0; break; case BPF_JSLT: - if (reg->smax_value < sval) + if (reg1->smax_value < sval) return 1; - else if (reg->smin_value >= sval) + else if (reg1->smin_value >= sval) return 0; break; case BPF_JGE: - if (reg->umin_value >= val) + if (reg1->umin_value >= val) return 1; - else if (reg->umax_value < val) + else if (reg1->umax_value < val) return 0; break; case BPF_JSGE: - if (reg->smin_value >= sval) + if (reg1->smin_value >= sval) return 1; - else if (reg->smax_value < sval) + else if (reg1->smax_value < sval) return 0; break; case BPF_JLE: - if (reg->umax_value <= val) + if (reg1->umax_value <= val) return 1; - else if (reg->umin_value > val) + else if (reg1->umin_value > val) return 0; break; case BPF_JSLE: - if (reg->smax_value <= sval) + if (reg1->smax_value <= sval) return 1; - else if (reg->smin_value > sval) + else if (reg1->smin_value > sval) return 0; break; } @@ -14336,11 +14336,11 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) * -1 - unknown. 
Example: "if (reg < 5)" is unknown when register value * range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, +static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) { - if (!reg_not_null(reg)) + if (__is_pointer_value(false, reg1)) { + if (!reg_not_null(reg1)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -14360,8 +14360,8 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, } if (is_jmp32) - return is_branch32_taken(reg, val, opcode); - return is_branch64_taken(reg, val, opcode); + return is_branch32_taken(reg1, val, opcode); + return is_branch64_taken(reg1, val, opcode); } static int flip_opcode(u32 opcode) From fc3615dd0ee9c2c9667a0d48e6f2376b50343c6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:54 -0700 Subject: [PATCH 141/304] bpf: generalize is_branch_taken() to work with two registers While still assuming that second register is a constant, generalize is_branch_taken-related code to accept two registers instead of register plus explicit constant value. This also, as a side effect, allows to simplify check_cond_jmp_op() by unifying BPF_K case with BPF_X case, for which we use a fake register to represent BPF_K's imm constant as a register. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102033759.2541186-13-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 57 ++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 725f327ce5eb8..5e722aaef7edb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14167,9 +14167,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } -static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode) +/* + * , currently assuming reg2 is a constant + */ +static int is_branch32_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) { struct tnum subreg = tnum_subreg(reg1->var_off); + u32 val = (u32)tnum_subreg(reg2->var_off).value; s32 sval = (s32)val; switch (opcode) { @@ -14249,8 +14253,12 @@ static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode) } -static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode) +/* + * , currently assuming reg2 is a constant + */ +static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) { + u64 val = reg2->var_off.value; s64 sval = (s64)val; switch (opcode) { @@ -14329,16 +14337,23 @@ static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode) return -1; } -/* compute branch direction of the expression "if (reg opcode val) goto target;" +/* compute branch direction of the expression "if ( opcode ) goto target;" * and return: * 1 - branch will be taken and "goto target" will be executed * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg < 5)" is unknown when register value + * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value * range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode, - bool is_jmp32) +static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) { + struct tnum reg2_tnum = is_jmp32 ? 
tnum_subreg(reg2->var_off) : reg2->var_off; + u64 val; + + if (!tnum_is_const(reg2_tnum)) + return -1; + val = reg2_tnum.value; + if (__is_pointer_value(false, reg1)) { if (!reg_not_null(reg1)) return -1; @@ -14360,8 +14375,8 @@ static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode, } if (is_jmp32) - return is_branch32_taken(reg1, val, opcode); - return is_branch64_taken(reg1, val, opcode); + return is_branch32_taken(reg1, reg2, opcode); + return is_branch64_taken(reg1, reg2, opcode); } static int flip_opcode(u32 opcode) @@ -14832,6 +14847,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -14872,36 +14888,27 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } + src_reg = &fake_reg; + src_reg->type = SCALAR_VALUE; + __mark_reg_known(src_reg, insn->imm); } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (BPF_SRC(insn->code) == BPF_K) { - pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); } else if (src_reg->type == SCALAR_VALUE && is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { - pred = is_branch_taken(dst_reg, - tnum_subreg(src_reg->var_off).value, - opcode, - is_jmp32); + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); } else if (src_reg->type == SCALAR_VALUE && !is_jmp32 && tnum_is_const(src_reg->var_off)) { - pred = is_branch_taken(dst_reg, - src_reg->var_off.value, - opcode, - is_jmp32); + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); } else if (dst_reg->type == SCALAR_VALUE && is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { - pred = is_branch_taken(src_reg, - tnum_subreg(dst_reg->var_off).value, - flip_opcode(opcode), - is_jmp32); + pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); } else if (dst_reg->type == SCALAR_VALUE && !is_jmp32 && tnum_is_const(dst_reg->var_off)) { - pred = is_branch_taken(src_reg, - dst_reg->var_off.value, - flip_opcode(opcode), - is_jmp32); + pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); } else if (reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg) && !is_jmp32) { From dd2a2cc3c1bfaa9dd755fafcb7812eb6d26ca2f9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:55 -0700 Subject: [PATCH 142/304] bpf: move is_branch_taken() down Move is_branch_taken() slightly down. In subsequent patched we'll need both flip_opcode() and is_pkt_ptr_branch_taken() for is_branch_taken(), but instead of sprinkling forward declarations around, it makes more sense to move is_branch_taken() lower below is_pkt_ptr_branch_taken(), and also keep it closer to very tightly related reg_set_min_max(), as they are two critical parts of the same SCALAR range tracking logic. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-14-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 84 +++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e722aaef7edb..c5d187d43fa1c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14337,48 +14337,6 @@ static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *r return -1; } -/* compute branch direction of the expression "if ( opcode ) goto target;" - * and return: - * 1 - branch will be taken and "goto target" will be executed - * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value - * range [0,10] - */ -static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, - u8 opcode, bool is_jmp32) -{ - struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; - u64 val; - - if (!tnum_is_const(reg2_tnum)) - return -1; - val = reg2_tnum.value; - - if (__is_pointer_value(false, reg1)) { - if (!reg_not_null(reg1)) - return -1; - - /* If pointer is valid tests against zero will fail so we can - * use this to direct branch taken. - */ - if (val != 0) - return -1; - - switch (opcode) { - case BPF_JEQ: - return 0; - case BPF_JNE: - return 1; - default: - return -1; - } - } - - if (is_jmp32) - return is_branch32_taken(reg1, reg2, opcode); - return is_branch64_taken(reg1, reg2, opcode); -} - static int flip_opcode(u32 opcode) { /* How can we transform "a b" into "b a"? */ @@ -14440,6 +14398,48 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, return -1; } +/* compute branch direction of the expression "if ( opcode ) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value + * range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; + u64 val; + + if (!tnum_is_const(reg2_tnum)) + return -1; + val = reg2_tnum.value; + + if (__is_pointer_value(false, reg1)) { + if (!reg_not_null(reg1)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } + + if (is_jmp32) + return is_branch32_taken(reg1, reg2, opcode); + return is_branch64_taken(reg1, reg2, opcode); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. From 171de12646d23b240fc73113acf1f62d85dd9d02 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:56 -0700 Subject: [PATCH 143/304] bpf: generalize is_branch_taken to handle all conditional jumps in one place Make is_branch_taken() a single entry point for branch pruning decision making, handling both pointer vs pointer, pointer vs scalar, and scalar vs scalar cases in one place. This also nicely cleans up check_cond_jmp_op(). 
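For illustration, here is a toy user-space model of the normalization the unified entry point relies on: if the right-hand operand is not a known constant, flip the comparison and swap the operands so the constant ends up on the right, then decide the branch from the left register's bounds. The names (struct scalar, flip(), branch_taken()) and the reduced opcode set are assumptions of this sketch, not the kernel API.

/* Toy model of the "flip and swap" normalization; not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum op { OP_LT, OP_GT, OP_LE, OP_GE };

struct scalar {
        bool is_const;
        uint64_t val;           /* valid when is_const */
        uint64_t umin, umax;    /* unsigned bounds otherwise */
};

static enum op flip(enum op op)
{
        switch (op) {
        case OP_LT: return OP_GT;
        case OP_GT: return OP_LT;
        case OP_LE: return OP_GE;
        case OP_GE: return OP_LE;
        }
        return op;
}

/* 1 - branch taken, 0 - not taken, -1 - unknown */
static int branch_taken(struct scalar *r1, struct scalar *r2, enum op op)
{
        if (!r2->is_const) {            /* put the constant on the right */
                struct scalar *tmp = r1;

                op = flip(op);
                r1 = r2;
                r2 = tmp;
        }
        if (!r2->is_const)
                return -1;              /* neither side is known */

        switch (op) {
        case OP_LT: return r1->umax < r2->val ? 1 : (r1->umin >= r2->val ? 0 : -1);
        case OP_GT: return r1->umin > r2->val ? 1 : (r1->umax <= r2->val ? 0 : -1);
        default:    return -1;          /* other opcodes elided in this sketch */
        }
}

int main(void)
{
        struct scalar c5 = { .is_const = true, .val = 5, .umin = 5, .umax = 5 };
        struct scalar r1 = { .is_const = false, .umin = 10, .umax = 20 };

        /* "if (5 < r1)" is normalized to "if (r1 > 5)" */
        printf("%d\n", branch_taken(&c5, &r1, OP_LT));  /* prints 1 */
        return 0;
}

Running it prints 1: "if (5 < r1)" with r1 in [10, 20] is normalized to "if (r1 > 5)", which is always taken.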
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-15-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 49 ++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5d187d43fa1c..d5213cef53891 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14167,6 +14167,19 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } +/* check if register is a constant scalar value */ +static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) +{ + return reg->type == SCALAR_VALUE && + tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off); +} + +/* assuming is_reg_const() is true, return constant value of a register */ +static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) +{ + return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; +} + /* * , currently assuming reg2 is a constant */ @@ -14408,12 +14421,20 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { - struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; u64 val; - if (!tnum_is_const(reg2_tnum)) + if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) + return is_pkt_ptr_branch_taken(reg1, reg2, opcode); + + /* try to make sure reg2 is a constant SCALAR_VALUE */ + if (!is_reg_const(reg2, is_jmp32)) { + opcode = flip_opcode(opcode); + swap(reg1, reg2); + } + /* for now we expect reg2 to be a constant to make any useful decisions */ + if (!is_reg_const(reg2, is_jmp32)) return -1; - val = reg2_tnum.value; + val = reg_const_value(reg2, is_jmp32); if (__is_pointer_value(false, reg1)) { if (!reg_not_null(reg1)) @@ -14894,27 +14915,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - - if (BPF_SRC(insn->code) == BPF_K) { - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(src_reg->var_off)) { - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { - pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(dst_reg->var_off)) { - pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); - } else if (reg_is_pkt_pointer_any(dst_reg) && - reg_is_pkt_pointer_any(src_reg) && - !is_jmp32) { - pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); - } - + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison. From 761a9e560d0ca0a6a2235c426cca80926672467e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:57 -0700 Subject: [PATCH 144/304] bpf: unify 32-bit and 64-bit is_branch_taken logic Combine 32-bit and 64-bit is_branch_taken logic for SCALAR_VALUE registers. 
It makes it easier to see parallels between two domains (32-bit and 64-bit), and makes subsequent refactoring more straightforward. No functional changes. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-16-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 200 +++++++++++++----------------------------- 1 file changed, 59 insertions(+), 141 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d5213cef53891..b077dd99b1592 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14183,166 +14183,86 @@ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) /* * , currently assuming reg2 is a constant */ -static int is_branch32_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) +static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) { - struct tnum subreg = tnum_subreg(reg1->var_off); - u32 val = (u32)tnum_subreg(reg2->var_off).value; - s32 sval = (s32)val; + struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; + u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; + u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; + s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; + s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; + u64 uval = is_jmp32 ? (u32)tnum_subreg(reg2->var_off).value : reg2->var_off.value; + s64 sval = is_jmp32 ? (s32)uval : (s64)uval; switch (opcode) { case BPF_JEQ: - if (tnum_is_const(subreg)) - return !!tnum_equals_const(subreg, val); - else if (val < reg1->u32_min_value || val > reg1->u32_max_value) + if (tnum_is_const(t1)) + return !!tnum_equals_const(t1, uval); + else if (uval < umin1 || uval > umax1) return 0; - else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) + else if (sval < smin1 || sval > smax1) return 0; break; case BPF_JNE: - if (tnum_is_const(subreg)) - return !tnum_equals_const(subreg, val); - else if (val < reg1->u32_min_value || val > reg1->u32_max_value) + if (tnum_is_const(t1)) + return !tnum_equals_const(t1, uval); + else if (uval < umin1 || uval > umax1) return 1; - else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) + else if (sval < smin1 || sval > smax1) return 1; break; case BPF_JSET: - if ((~subreg.mask & subreg.value) & val) + if ((~t1.mask & t1.value) & uval) return 1; - if (!((subreg.mask | subreg.value) & val)) + if (!((t1.mask | t1.value) & uval)) return 0; break; case BPF_JGT: - if (reg1->u32_min_value > val) + if (umin1 > uval ) return 1; - else if (reg1->u32_max_value <= val) + else if (umax1 <= uval) return 0; break; case BPF_JSGT: - if (reg1->s32_min_value > sval) + if (smin1 > sval) return 1; - else if (reg1->s32_max_value <= sval) + else if (smax1 <= sval) return 0; break; case BPF_JLT: - if (reg1->u32_max_value < val) + if (umax1 < uval) return 1; - else if (reg1->u32_min_value >= val) + else if (umin1 >= uval) return 0; break; case BPF_JSLT: - if (reg1->s32_max_value < sval) + if (smax1 < sval) return 1; - else if (reg1->s32_min_value >= sval) + else if (smin1 >= sval) return 0; break; case BPF_JGE: - if (reg1->u32_min_value >= val) + if (umin1 >= uval) return 1; - else if (reg1->u32_max_value < val) + else if (umax1 < uval) return 0; break; case BPF_JSGE: - if (reg1->s32_min_value >= sval) + if (smin1 >= sval) return 1; - else if (reg1->s32_max_value < sval) + else if (smax1 < sval) return 0; break; 
case BPF_JLE: - if (reg1->u32_max_value <= val) + if (umax1 <= uval) return 1; - else if (reg1->u32_min_value > val) + else if (umin1 > uval) return 0; break; case BPF_JSLE: - if (reg1->s32_max_value <= sval) + if (smax1 <= sval) return 1; - else if (reg1->s32_min_value > sval) - return 0; - break; - } - - return -1; -} - - -/* - * , currently assuming reg2 is a constant - */ -static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) -{ - u64 val = reg2->var_off.value; - s64 sval = (s64)val; - - switch (opcode) { - case BPF_JEQ: - if (tnum_is_const(reg1->var_off)) - return !!tnum_equals_const(reg1->var_off, val); - else if (val < reg1->umin_value || val > reg1->umax_value) - return 0; - else if (sval < reg1->smin_value || sval > reg1->smax_value) - return 0; - break; - case BPF_JNE: - if (tnum_is_const(reg1->var_off)) - return !tnum_equals_const(reg1->var_off, val); - else if (val < reg1->umin_value || val > reg1->umax_value) - return 1; - else if (sval < reg1->smin_value || sval > reg1->smax_value) - return 1; - break; - case BPF_JSET: - if ((~reg1->var_off.mask & reg1->var_off.value) & val) - return 1; - if (!((reg1->var_off.mask | reg1->var_off.value) & val)) - return 0; - break; - case BPF_JGT: - if (reg1->umin_value > val) - return 1; - else if (reg1->umax_value <= val) - return 0; - break; - case BPF_JSGT: - if (reg1->smin_value > sval) - return 1; - else if (reg1->smax_value <= sval) - return 0; - break; - case BPF_JLT: - if (reg1->umax_value < val) - return 1; - else if (reg1->umin_value >= val) - return 0; - break; - case BPF_JSLT: - if (reg1->smax_value < sval) - return 1; - else if (reg1->smin_value >= sval) - return 0; - break; - case BPF_JGE: - if (reg1->umin_value >= val) - return 1; - else if (reg1->umax_value < val) - return 0; - break; - case BPF_JSGE: - if (reg1->smin_value >= sval) - return 1; - else if (reg1->smax_value < sval) - return 0; - break; - case BPF_JLE: - if (reg1->umax_value <= val) - return 1; - else if (reg1->umin_value > val) - return 0; - break; - case BPF_JSLE: - if (reg1->smax_value <= sval) - return 1; - else if (reg1->smin_value > sval) + else if (smin1 > sval) return 0; break; } @@ -14456,9 +14376,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg } } - if (is_jmp32) - return is_branch32_taken(reg1, reg2, opcode); - return is_branch64_taken(reg1, reg2, opcode); + return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); } /* Adjusts the register min/max values in the case that the dst_reg is the @@ -14468,15 +14386,15 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg */ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, - u64 val, u32 val32, + u64 uval, u32 uval32, u8 opcode, bool is_jmp32) { struct tnum false_32off = tnum_subreg(false_reg->var_off); struct tnum false_64off = false_reg->var_off; struct tnum true_32off = tnum_subreg(true_reg->var_off); struct tnum true_64off = true_reg->var_off; - s64 sval = (s64)val; - s32 sval32 = (s32)val32; + s64 sval = (s64)uval; + s32 sval32 = (s32)uval32; /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into @@ -14499,49 +14417,49 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ case BPF_JEQ: if (is_jmp32) { - __mark_reg32_known(true_reg, val32); + __mark_reg32_known(true_reg, uval32); true_32off = tnum_subreg(true_reg->var_off); } else { - 
___mark_reg_known(true_reg, val); + ___mark_reg_known(true_reg, uval); true_64off = true_reg->var_off; } break; case BPF_JNE: if (is_jmp32) { - __mark_reg32_known(false_reg, val32); + __mark_reg32_known(false_reg, uval32); false_32off = tnum_subreg(false_reg->var_off); } else { - ___mark_reg_known(false_reg, val); + ___mark_reg_known(false_reg, uval); false_64off = false_reg->var_off; } break; case BPF_JSET: if (is_jmp32) { - false_32off = tnum_and(false_32off, tnum_const(~val32)); - if (is_power_of_2(val32)) + false_32off = tnum_and(false_32off, tnum_const(~uval32)); + if (is_power_of_2(uval32)) true_32off = tnum_or(true_32off, - tnum_const(val32)); + tnum_const(uval32)); } else { - false_64off = tnum_and(false_64off, tnum_const(~val)); - if (is_power_of_2(val)) + false_64off = tnum_and(false_64off, tnum_const(~uval)); + if (is_power_of_2(uval)) true_64off = tnum_or(true_64off, - tnum_const(val)); + tnum_const(uval)); } break; case BPF_JGE: case BPF_JGT: { if (is_jmp32) { - u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1; - u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32; + u32 false_umax = opcode == BPF_JGT ? uval32 : uval32 - 1; + u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32; false_reg->u32_max_value = min(false_reg->u32_max_value, false_umax); true_reg->u32_min_value = max(true_reg->u32_min_value, true_umin); } else { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + u64 false_umax = opcode == BPF_JGT ? uval : uval - 1; + u64 true_umin = opcode == BPF_JGT ? uval + 1 : uval; false_reg->umax_value = min(false_reg->umax_value, false_umax); true_reg->umin_value = max(true_reg->umin_value, true_umin); @@ -14570,16 +14488,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JLT: { if (is_jmp32) { - u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1; - u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32; + u32 false_umin = opcode == BPF_JLT ? uval32 : uval32 + 1; + u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32; false_reg->u32_min_value = max(false_reg->u32_min_value, false_umin); true_reg->u32_max_value = min(true_reg->u32_max_value, true_umax); } else { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + u64 false_umin = opcode == BPF_JLT ? uval : uval + 1; + u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval; false_reg->umin_value = max(false_reg->umin_value, false_umin); true_reg->umax_value = min(true_reg->umax_value, true_umax); @@ -14628,7 +14546,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, - u64 val, u32 val32, + u64 uval, u32 uval32, u8 opcode, bool is_jmp32) { opcode = flip_opcode(opcode); @@ -14636,7 +14554,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, * BPF_JA, can't get here. */ if (opcode) - reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32); + reg_set_min_max(true_reg, false_reg, uval, uval32, opcode, is_jmp32); } /* Regs are known to be equal, so intersect their min/max/var_off */ From 4c617286771ed129dddbeb7d15cb1cc63b2ba08a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:58 -0700 Subject: [PATCH 145/304] bpf: prepare reg_set_min_max for second set of registers Similarly to is_branch_taken()-related refactorings, start preparing reg_set_min_max() to handle more generic case of two non-const registers. 
Start with renaming arguments to accommodate later addition of second register as an input argument. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-17-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 80 +++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b077dd99b1592..438bf96b1c2d1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14384,25 +14384,25 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg * simply doing a BPF_K check. * In JEQ/JNE cases we also adjust the var_off values. */ -static void reg_set_min_max(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, +static void reg_set_min_max(struct bpf_reg_state *true_reg1, + struct bpf_reg_state *false_reg1, u64 uval, u32 uval32, u8 opcode, bool is_jmp32) { - struct tnum false_32off = tnum_subreg(false_reg->var_off); - struct tnum false_64off = false_reg->var_off; - struct tnum true_32off = tnum_subreg(true_reg->var_off); - struct tnum true_64off = true_reg->var_off; + struct tnum false_32off = tnum_subreg(false_reg1->var_off); + struct tnum false_64off = false_reg1->var_off; + struct tnum true_32off = tnum_subreg(true_reg1->var_off); + struct tnum true_64off = true_reg1->var_off; s64 sval = (s64)uval; s32 sval32 = (s32)uval32; /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. - * Since false_reg and true_reg have the same type by construction, we + * Since false_reg1 and true_reg1 have the same type by construction, we * only need to check one of them for pointerness. */ - if (__is_pointer_value(false, false_reg)) + if (__is_pointer_value(false, false_reg1)) return; switch (opcode) { @@ -14417,20 +14417,20 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ case BPF_JEQ: if (is_jmp32) { - __mark_reg32_known(true_reg, uval32); - true_32off = tnum_subreg(true_reg->var_off); + __mark_reg32_known(true_reg1, uval32); + true_32off = tnum_subreg(true_reg1->var_off); } else { - ___mark_reg_known(true_reg, uval); - true_64off = true_reg->var_off; + ___mark_reg_known(true_reg1, uval); + true_64off = true_reg1->var_off; } break; case BPF_JNE: if (is_jmp32) { - __mark_reg32_known(false_reg, uval32); - false_32off = tnum_subreg(false_reg->var_off); + __mark_reg32_known(false_reg1, uval32); + false_32off = tnum_subreg(false_reg1->var_off); } else { - ___mark_reg_known(false_reg, uval); - false_64off = false_reg->var_off; + ___mark_reg_known(false_reg1, uval); + false_64off = false_reg1->var_off; } break; case BPF_JSET: @@ -14453,16 +14453,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, u32 false_umax = opcode == BPF_JGT ? uval32 : uval32 - 1; u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32; - false_reg->u32_max_value = min(false_reg->u32_max_value, + false_reg1->u32_max_value = min(false_reg1->u32_max_value, false_umax); - true_reg->u32_min_value = max(true_reg->u32_min_value, + true_reg1->u32_min_value = max(true_reg1->u32_min_value, true_umin); } else { u64 false_umax = opcode == BPF_JGT ? uval : uval - 1; u64 true_umin = opcode == BPF_JGT ? 
uval + 1 : uval; - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + false_reg1->umax_value = min(false_reg1->umax_value, false_umax); + true_reg1->umin_value = max(true_reg1->umin_value, true_umin); } break; } @@ -14473,14 +14473,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax); - true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin); + false_reg1->s32_max_value = min(false_reg1->s32_max_value, false_smax); + true_reg1->s32_min_value = max(true_reg1->s32_min_value, true_smin); } else { s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); + false_reg1->smax_value = min(false_reg1->smax_value, false_smax); + true_reg1->smin_value = max(true_reg1->smin_value, true_smin); } break; } @@ -14491,16 +14491,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, u32 false_umin = opcode == BPF_JLT ? uval32 : uval32 + 1; u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32; - false_reg->u32_min_value = max(false_reg->u32_min_value, + false_reg1->u32_min_value = max(false_reg1->u32_min_value, false_umin); - true_reg->u32_max_value = min(true_reg->u32_max_value, + true_reg1->u32_max_value = min(true_reg1->u32_max_value, true_umax); } else { u64 false_umin = opcode == BPF_JLT ? uval : uval + 1; u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval; - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + false_reg1->umin_value = max(false_reg1->umin_value, false_umin); + true_reg1->umax_value = min(true_reg1->umax_value, true_umax); } break; } @@ -14511,14 +14511,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin); - true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax); + false_reg1->s32_min_value = max(false_reg1->s32_min_value, false_smin); + true_reg1->s32_max_value = min(true_reg1->s32_max_value, true_smax); } else { s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; s64 true_smax = opcode == BPF_JSLT ? 
sval - 1 : sval; - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); + false_reg1->smin_value = max(false_reg1->smin_value, false_smin); + true_reg1->smax_value = min(true_reg1->smax_value, true_smax); } break; } @@ -14527,17 +14527,17 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, } if (is_jmp32) { - false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off), + false_reg1->var_off = tnum_or(tnum_clear_subreg(false_64off), tnum_subreg(false_32off)); - true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), + true_reg1->var_off = tnum_or(tnum_clear_subreg(true_64off), tnum_subreg(true_32off)); - reg_bounds_sync(false_reg); - reg_bounds_sync(true_reg); + reg_bounds_sync(false_reg1); + reg_bounds_sync(true_reg1); } else { - false_reg->var_off = false_64off; - true_reg->var_off = true_64off; - reg_bounds_sync(false_reg); - reg_bounds_sync(true_reg); + false_reg1->var_off = false_64off; + true_reg1->var_off = true_64off; + reg_bounds_sync(false_reg1); + reg_bounds_sync(true_reg1); } } From 9a14d62a2cdb33e983cddf2e994b9766c54ceedb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:59 -0700 Subject: [PATCH 146/304] bpf: generalize reg_set_min_max() to handle two sets of two registers Change reg_set_min_max() to take FALSE/TRUE sets of two registers each, instead of assuming that we are always comparing to a constant. For now we still assume that right-hand side registers are constants (and make sure that's the case by swapping src/dst regs, if necessary), but subsequent patches will remove this limitation. reg_set_min_max() is now called unconditionally for any register comparison, so that might include pointer vs pointer. This makes it consistent with is_branch_taken() generality. But we currently only support adjustments based on SCALAR vs SCALAR comparisons, so reg_set_min_max() has to guard itself againts pointers. Taking two by two registers allows to further unify and simplify check_cond_jmp_op() logic. We utilize fake register for BPF_K conditional jump case, just like with is_branch_taken() part. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-18-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 131 ++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 75 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 438bf96b1c2d1..2197385d91dc6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14379,32 +14379,50 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); } -/* Adjusts the register min/max values in the case that the dst_reg is the - * variable register that we are working on, and src_reg is a constant or we're - * simply doing a BPF_K check. - * In JEQ/JNE cases we also adjust the var_off values. +/* Adjusts the register min/max values in the case that the dst_reg and + * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K + * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * Technically we can do similar adjustments for pointers to the same object, + * but we don't support that right now. 
*/ static void reg_set_min_max(struct bpf_reg_state *true_reg1, + struct bpf_reg_state *true_reg2, struct bpf_reg_state *false_reg1, - u64 uval, u32 uval32, + struct bpf_reg_state *false_reg2, u8 opcode, bool is_jmp32) { - struct tnum false_32off = tnum_subreg(false_reg1->var_off); - struct tnum false_64off = false_reg1->var_off; - struct tnum true_32off = tnum_subreg(true_reg1->var_off); - struct tnum true_64off = true_reg1->var_off; - s64 sval = (s64)uval; - s32 sval32 = (s32)uval32; - - /* If the dst_reg is a pointer, we can't learn anything about its - * variable offset from the compare (unless src_reg were a pointer into - * the same object, but we don't bother with that. - * Since false_reg1 and true_reg1 have the same type by construction, we - * only need to check one of them for pointerness. + struct tnum false_32off, false_64off; + struct tnum true_32off, true_64off; + u64 uval; + u32 uval32; + s64 sval; + s32 sval32; + + /* If either register is a pointer, we can't learn anything about its + * variable offset from the compare (unless they were a pointer into + * the same object, but we don't bother with that). */ - if (__is_pointer_value(false, false_reg1)) + if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) + return; + + /* we expect right-hand registers (src ones) to be constants, for now */ + if (!is_reg_const(false_reg2, is_jmp32)) { + opcode = flip_opcode(opcode); + swap(true_reg1, true_reg2); + swap(false_reg1, false_reg2); + } + if (!is_reg_const(false_reg2, is_jmp32)) return; + false_32off = tnum_subreg(false_reg1->var_off); + false_64off = false_reg1->var_off; + true_32off = tnum_subreg(true_reg1->var_off); + true_64off = true_reg1->var_off; + uval = false_reg2->var_off.value; + uval32 = (u32)tnum_subreg(false_reg2->var_off).value; + sval = (s64)uval; + sval32 = (s32)uval32; + switch (opcode) { /* JEQ/JNE comparison doesn't change the register equivalence. * @@ -14541,22 +14559,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg1, } } -/* Same as above, but for the case that dst_reg holds a constant and src_reg is - * the variable reg. - */ -static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, - u64 uval, u32 uval32, - u8 opcode, bool is_jmp32) -{ - opcode = flip_opcode(opcode); - /* This uses zero as "not present in table"; luckily the zero opcode, - * BPF_JA, can't get here. - */ - if (opcode) - reg_set_min_max(true_reg, false_reg, uval, uval32, opcode, is_jmp32); -} - /* Regs are known to be equal, so intersect their min/max/var_off */ static void __reg_combine_min_max(struct bpf_reg_state *src_reg, struct bpf_reg_state *dst_reg) @@ -14881,53 +14883,32 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; - /* detect if we are comparing against a constant value so we can adjust - * our min/max values for our dst register. - * this is only legit if both are scalars (or pointers to the same - * object, I suppose, see the PTR_MAYBE_NULL related if block below), - * because otherwise the different base pointers mean the offsets aren't - * comparable. 
- */ if (BPF_SRC(insn->code) == BPF_X) { - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + reg_set_min_max(&other_branch_regs[insn->dst_reg], + &other_branch_regs[insn->src_reg], + dst_reg, src_reg, opcode, is_jmp32); if (dst_reg->type == SCALAR_VALUE && - src_reg->type == SCALAR_VALUE) { - if (tnum_is_const(src_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(src_reg->var_off)))) - reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, - src_reg->var_off.value, - tnum_subreg(src_reg->var_off).value, - opcode, is_jmp32); - else if (tnum_is_const(dst_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(dst_reg->var_off)))) - reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - src_reg, - dst_reg->var_off.value, - tnum_subreg(dst_reg->var_off).value, - opcode, is_jmp32); - else if (!is_jmp32 && - (opcode == BPF_JEQ || opcode == BPF_JNE)) - /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch_regs[insn->src_reg], - &other_branch_regs[insn->dst_reg], - src_reg, dst_reg, opcode); - if (src_reg->id && - !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); - } - - } - } else if (dst_reg->type == SCALAR_VALUE) { + src_reg->type == SCALAR_VALUE && + !is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) { + /* Comparing for equality, we can combine knowledge */ + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], + src_reg, dst_reg, opcode); + } + } else /* BPF_SRC(insn->code) == BPF_K */ { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, (u32)insn->imm, - opcode, is_jmp32); + src_reg /* fake one */, + dst_reg, src_reg /* same fake one */, + opcode, is_jmp32); } + if (BPF_SRC(insn->code) == BPF_X && + src_reg->type == SCALAR_VALUE && src_reg->id && + !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { find_equal_scalars(this_branch, dst_reg); From 3f1f234e677b81b2f57d5b01e55e0dff9871a986 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Nov 2023 11:24:53 +0800 Subject: [PATCH 147/304] selftests/bpf: Use value with enough-size when updating per-cpu map When updating per-cpu map in map_percpu_stats test, patch_map_thread() only passes 4-bytes-sized value to bpf_map_update_elem(). The expected size of the value is 8 * num_possible_cpus(), so fix it by passing a value with enough-size for per-cpu map update. 
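As a concrete illustration of the sizing rule above, here is a minimal user-space sketch (not part of the patch) for updating one key of a per-cpu map whose value type is a u64; the helper name and the BPF_ANY flag choice are illustrative, and map_fd is assumed to be a valid fd for a BPF_MAP_TYPE_PERCPU_HASH or BPF_MAP_TYPE_LRU_PERCPU_HASH map:

#include <errno.h>
#include <stdlib.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* The kernel expects one 8-byte-aligned value slot per possible CPU, so the
 * buffer handed to bpf_map_update_elem() must be
 * 8 * libbpf_num_possible_cpus() bytes, not sizeof(__u64).
 */
static int update_percpu_u64(int map_fd, __u32 key, __u64 val)
{
	int nr_cpus = libbpf_num_possible_cpus();
	__u64 *vals;
	int i, ret;

	if (nr_cpus < 0)
		return nr_cpus;

	vals = calloc(nr_cpus, sizeof(*vals));	/* one slot per possible CPU */
	if (!vals)
		return -ENOMEM;

	for (i = 0; i < nr_cpus; i++)
		vals[i] = val;	/* seed the same value on every CPU */

	ret = bpf_map_update_elem(map_fd, &key, vals, BPF_ANY);
	free(vals);
	return ret;
}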
Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231101032455.3808547-2-houtao@huaweicloud.com --- .../bpf/map_tests/map_percpu_stats.c | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c index 1a9eeefda9a87..8ad17d051ef8f 100644 --- a/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c +++ b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c @@ -131,6 +131,12 @@ static bool is_lru(__u32 map_type) map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static bool is_percpu(__u32 map_type) +{ + return map_type == BPF_MAP_TYPE_PERCPU_HASH || + map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + struct upsert_opts { __u32 map_type; int map_fd; @@ -150,17 +156,26 @@ static int create_small_hash(void) static void *patch_map_thread(void *arg) { + /* 8KB is enough for 1024 CPUs. And it is shared between N_THREADS. */ + static __u8 blob[8 << 10]; struct upsert_opts *opts = arg; + void *val_ptr; int val; int ret; int i; for (i = 0; i < opts->n; i++) { - if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) + if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { val = create_small_hash(); - else + val_ptr = &val; + } else if (is_percpu(opts->map_type)) { + val_ptr = blob; + } else { val = rand(); - ret = bpf_map_update_elem(opts->map_fd, &i, &val, 0); + val_ptr = &val; + } + + ret = bpf_map_update_elem(opts->map_fd, &i, val_ptr, 0); CHECK(ret < 0, "bpf_map_update_elem", "key=%d error: %s\n", i, strerror(errno)); if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) From ff38534e8251cf42831c0d16df28ff9ca6c94e6d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Nov 2023 11:24:54 +0800 Subject: [PATCH 148/304] selftests/bpf: Export map_update_retriable() Export map_update_retriable() to make it usable for other map_test cases. These cases may only need retry for specific errno, so add a new callback parameter to let map_update_retriable() decide whether or not the errno is retriable. 
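A sketch of how another map test might reuse the exported helper is shown below; map_update_retriable() and the retry_for_error_fn typedef are the ones added by this patch, while the EINTR-only retry policy, the attempt count and the wrapper name are made up for illustration:

#include <errno.h>
#include <stdbool.h>
#include <linux/bpf.h>
#include "test_maps.h"

/* Retry only when the update was interrupted; any other errno fails fast. */
static bool retry_for_eintr(int err)
{
	return err == EINTR;
}

static int update_with_retries(int map_fd, __u32 key, __u64 value)
{
	/* up to 16 attempts, reusing the randomized delay that
	 * map_update_retriable() already applies between attempts;
	 * returns 0 on success or -errno on failure
	 */
	return map_update_retriable(map_fd, &key, &value, BPF_ANY, 16,
				    retry_for_eintr);
}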
Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231101032455.3808547-3-houtao@huaweicloud.com --- tools/testing/selftests/bpf/test_maps.c | 17 ++++++++++++----- tools/testing/selftests/bpf/test_maps.h | 5 +++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 7fc00e423e4dd..767e0693df106 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1396,13 +1396,18 @@ static void test_map_stress(void) #define MAX_DELAY_US 50000 #define MIN_DELAY_RANGE_US 5000 -static int map_update_retriable(int map_fd, const void *key, const void *value, - int flags, int attempts) +static bool retry_for_again_or_busy(int err) +{ + return (err == EAGAIN || err == EBUSY); +} + +int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, + retry_for_error_fn need_retry) { int delay = rand() % MIN_DELAY_RANGE_US; while (bpf_map_update_elem(map_fd, key, value, flags)) { - if (!attempts || (errno != EAGAIN && errno != EBUSY)) + if (!attempts || !need_retry(errno)) return -errno; if (delay <= MAX_DELAY_US / 2) @@ -1445,11 +1450,13 @@ static void test_update_delete(unsigned int fn, void *data) key = value = i; if (do_update) { - err = map_update_retriable(fd, &key, &value, BPF_NOEXIST, MAP_RETRIES); + err = map_update_retriable(fd, &key, &value, BPF_NOEXIST, MAP_RETRIES, + retry_for_again_or_busy); if (err) printf("error %d %d\n", err, errno); assert(err == 0); - err = map_update_retriable(fd, &key, &value, BPF_EXIST, MAP_RETRIES); + err = map_update_retriable(fd, &key, &value, BPF_EXIST, MAP_RETRIES, + retry_for_again_or_busy); if (err) printf("error %d %d\n", err, errno); assert(err == 0); diff --git a/tools/testing/selftests/bpf/test_maps.h b/tools/testing/selftests/bpf/test_maps.h index f6fbca761732f..e4ac704a536c1 100644 --- a/tools/testing/selftests/bpf/test_maps.h +++ b/tools/testing/selftests/bpf/test_maps.h @@ -4,6 +4,7 @@ #include #include +#include #define CHECK(condition, tag, format...) ({ \ int __ret = !!(condition); \ @@ -16,4 +17,8 @@ extern int skips; +typedef bool (*retry_for_error_fn)(int err); +int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, + retry_for_error_fn need_retry); + #endif From 57688b2a543be340d80c568b7f91c0660bb65c2f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Nov 2023 11:24:55 +0800 Subject: [PATCH 149/304] selftsets/bpf: Retry map update for non-preallocated per-cpu map BPF CI failed due to map_percpu_stats_percpu_hash from time to time [1]. It seems that the failure reason is per-cpu bpf memory allocator may not be able to allocate per-cpu pointer successfully and it can not refill free llist timely, and bpf_map_update_elem() will return -ENOMEM. So mitigate the problem by retrying the update operation for non-preallocated per-cpu map. 
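For context on what "non-preallocated" means here, the sketch below (not taken from the patch) shows the map-creation side: a per-cpu hash created with BPF_F_NO_PREALLOC allocates elements on demand from the BPF memory allocator instead of preallocating them up front, which is why heavily parallel updates can transiently fail with -ENOMEM. The map name, key/value sizes and max_entries are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf.h>

static int create_no_prealloc_percpu_hash(void)
{
	/* on-demand element allocation instead of preallocation */
	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);

	return bpf_map_create(BPF_MAP_TYPE_PERCPU_HASH, "pcpu_demo",
			      sizeof(__u32), sizeof(__u64), 4096, &opts);
}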
[1]: https://github.com/kernel-patches/bpf/actions/runs/6713177520/job/18244865326?pr=5909 Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231101032455.3808547-4-houtao@huaweicloud.com --- .../bpf/map_tests/map_percpu_stats.c | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c index 8ad17d051ef8f..e152535e9e3ec 100644 --- a/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c +++ b/tools/testing/selftests/bpf/map_tests/map_percpu_stats.c @@ -141,6 +141,7 @@ struct upsert_opts { __u32 map_type; int map_fd; __u32 n; + bool retry_for_nomem; }; static int create_small_hash(void) @@ -154,6 +155,11 @@ static int create_small_hash(void) return map_fd; } +static bool retry_for_nomem_fn(int err) +{ + return err == ENOMEM; +} + static void *patch_map_thread(void *arg) { /* 8KB is enough for 1024 CPUs. And it is shared between N_THREADS. */ @@ -175,7 +181,12 @@ static void *patch_map_thread(void *arg) val_ptr = &val; } - ret = bpf_map_update_elem(opts->map_fd, &i, val_ptr, 0); + /* 2 seconds may be enough ? */ + if (opts->retry_for_nomem) + ret = map_update_retriable(opts->map_fd, &i, val_ptr, 0, + 40, retry_for_nomem_fn); + else + ret = bpf_map_update_elem(opts->map_fd, &i, val_ptr, 0); CHECK(ret < 0, "bpf_map_update_elem", "key=%d error: %s\n", i, strerror(errno)); if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) @@ -296,6 +307,13 @@ static void __test(int map_fd) else opts.n /= 2; + /* per-cpu bpf memory allocator may not be able to allocate per-cpu + * pointer successfully and it can not refill free llist timely, and + * bpf_map_update_elem() will return -ENOMEM. so just retry to mitigate + * the problem temporarily. + */ + opts.retry_for_nomem = is_percpu(opts.map_type) && (info.map_flags & BPF_F_NO_PREALLOC); + /* * Upsert keys [0, n) under some competition: with random values from * N_THREADS threads. Check values, then delete all elements and check From 1a119e269dc69e82217525d92a93e082c4424fc8 Mon Sep 17 00:00:00 2001 From: Manu Bretelle Date: Tue, 31 Oct 2023 14:27:17 -0700 Subject: [PATCH 150/304] selftests/bpf: Consolidate VIRTIO/9P configs in config.vm file Those configs are needed to be able to run VM somewhat consistently. For instance, ATM, s390x is missing the `CONFIG_VIRTIO_CONSOLE` which prevents s390x kernels built in CI to leverage qemu-guest-agent. By moving them to `config,vm`, we should have selftest kernels which are equal in term of VM functionalities when they include this file. The set of config unabled were picked using grep -h -E '(_9P|_VIRTIO)' config.x86_64 config | sort | uniq added to `config.vm` and then grep -vE '(_9P|_VIRTIO)' config.{x86_64,aarch64,s390x} as a side-effect, some config may have disappeared to the aarch64 and s390x kernels, but they should not be needed. CI will tell. 
Signed-off-by: Manu Bretelle Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231031212717.4037892-1-chantr4@gmail.com --- tools/testing/selftests/bpf/config.aarch64 | 16 ---------------- tools/testing/selftests/bpf/config.s390x | 9 --------- tools/testing/selftests/bpf/config.vm | 12 ++++++++++++ tools/testing/selftests/bpf/config.x86_64 | 12 ------------ tools/testing/selftests/bpf/vmtest.sh | 4 +++- 5 files changed, 15 insertions(+), 38 deletions(-) create mode 100644 tools/testing/selftests/bpf/config.vm diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64 index 2538214948848..fa8ecf626c73e 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -1,4 +1,3 @@ -CONFIG_9P_FS=y CONFIG_ARCH_VEXPRESS=y CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y CONFIG_ARM_SMMU_V3=y @@ -46,7 +45,6 @@ CONFIG_DEBUG_SG=y CONFIG_DETECT_HUNG_TASK=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEVTMPFS=y -CONFIG_DRM_VIRTIO_GPU=y CONFIG_DRM=y CONFIG_DUMMY=y CONFIG_EXPERT=y @@ -67,7 +65,6 @@ CONFIG_HAVE_KRETPROBES=y CONFIG_HEADERS_INSTALL=y CONFIG_HIGH_RES_TIMERS=y CONFIG_HUGETLBFS=y -CONFIG_HW_RANDOM_VIRTIO=y CONFIG_HW_RANDOM=y CONFIG_HZ_100=y CONFIG_IDLE_PAGE_TRACKING=y @@ -99,8 +96,6 @@ CONFIG_MEMCG=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y -CONFIG_NET_9P_VIRTIO=y -CONFIG_NET_9P=y CONFIG_NET_ACT_BPF=y CONFIG_NET_ACT_GACT=y CONFIG_NETDEVICES=y @@ -140,7 +135,6 @@ CONFIG_SCHED_TRACER=y CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SCAN_ASYNC=y -CONFIG_SCSI_VIRTIO=y CONFIG_SCSI=y CONFIG_SECURITY_NETWORK=y CONFIG_SERIAL_AMBA_PL011_CONSOLE=y @@ -167,16 +161,6 @@ CONFIG_UPROBES=y CONFIG_USELIB=y CONFIG_USER_NS=y CONFIG_VETH=y -CONFIG_VIRTIO_BALLOON=y -CONFIG_VIRTIO_BLK=y -CONFIG_VIRTIO_CONSOLE=y -CONFIG_VIRTIO_FS=y -CONFIG_VIRTIO_INPUT=y -CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y -CONFIG_VIRTIO_MMIO=y -CONFIG_VIRTIO_NET=y -CONFIG_VIRTIO_PCI=y -CONFIG_VIRTIO_VSOCKETS_COMMON=y CONFIG_VLAN_8021Q=y CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x index 2ba92167be358..e933303828494 100644 --- a/tools/testing/selftests/bpf/config.s390x +++ b/tools/testing/selftests/bpf/config.s390x @@ -1,4 +1,3 @@ -CONFIG_9P_FS=y CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y CONFIG_AUDIT=y CONFIG_BLK_CGROUP=y @@ -84,8 +83,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y CONFIG_NET=y -CONFIG_NET_9P=y -CONFIG_NET_9P_VIRTIO=y CONFIG_NET_ACT_BPF=y CONFIG_NET_ACT_GACT=y CONFIG_NET_KEY=y @@ -114,7 +111,6 @@ CONFIG_SAMPLE_SECCOMP=y CONFIG_SAMPLES=y CONFIG_SCHED_TRACER=y CONFIG_SCSI=y -CONFIG_SCSI_VIRTIO=y CONFIG_SECURITY_NETWORK=y CONFIG_STACK_TRACER=y CONFIG_STATIC_KEYS_SELFTEST=y @@ -136,11 +132,6 @@ CONFIG_UPROBES=y CONFIG_USELIB=y CONFIG_USER_NS=y CONFIG_VETH=y -CONFIG_VIRTIO_BALLOON=y -CONFIG_VIRTIO_BLK=y -CONFIG_VIRTIO_NET=y -CONFIG_VIRTIO_PCI=y -CONFIG_VIRTIO_VSOCKETS_COMMON=y CONFIG_VLAN_8021Q=y CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y diff --git a/tools/testing/selftests/bpf/config.vm b/tools/testing/selftests/bpf/config.vm new file mode 100644 index 0000000000000..a9746ca787773 --- /dev/null +++ b/tools/testing/selftests/bpf/config.vm @@ -0,0 +1,12 @@ +CONFIG_9P_FS=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_CRYPTO_DEV_VIRTIO=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y +CONFIG_VIRTIO_BALLOON=y +CONFIG_VIRTIO_BLK=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_NET=y +CONFIG_VIRTIO_PCI=y 
+CONFIG_VIRTIO_VSOCKETS_COMMON=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index 2e70a60482784..f7bfb2b09c82b 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -1,6 +1,3 @@ -CONFIG_9P_FS=y -CONFIG_9P_FS_POSIX_ACL=y -CONFIG_9P_FS_SECURITY=y CONFIG_AGP=y CONFIG_AGP_AMD64=y CONFIG_AGP_INTEL=y @@ -45,7 +42,6 @@ CONFIG_CPU_IDLE_GOV_LADDER=y CONFIG_CPUSETS=y CONFIG_CRC_T10DIF=y CONFIG_CRYPTO_BLAKE2B=y -CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_XXHASH=y CONFIG_DCB=y @@ -145,8 +141,6 @@ CONFIG_MEMORY_FAILURE=y CONFIG_MINIX_SUBPARTITION=y CONFIG_NAMESPACES=y CONFIG_NET=y -CONFIG_NET_9P=y -CONFIG_NET_9P_VIRTIO=y CONFIG_NET_ACT_BPF=y CONFIG_NET_CLS_CGROUP=y CONFIG_NET_EMATCH=y @@ -228,12 +222,6 @@ CONFIG_USER_NS=y CONFIG_VALIDATE_FS_PARSER=y CONFIG_VETH=y CONFIG_VIRT_DRIVERS=y -CONFIG_VIRTIO_BALLOON=y -CONFIG_VIRTIO_BLK=y -CONFIG_VIRTIO_CONSOLE=y -CONFIG_VIRTIO_NET=y -CONFIG_VIRTIO_PCI=y -CONFIG_VIRTIO_VSOCKETS_COMMON=y CONFIG_VLAN_8021Q=y CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 6850345280184..65d14f3bbe301 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -36,7 +36,9 @@ DEFAULT_COMMAND="./test_progs" MOUNT_DIR="mnt" ROOTFS_IMAGE="root.img" OUTPUT_DIR="$HOME/.bpf_selftests" -KCONFIG_REL_PATHS=("tools/testing/selftests/bpf/config" "tools/testing/selftests/bpf/config.${ARCH}") +KCONFIG_REL_PATHS=("tools/testing/selftests/bpf/config" + "tools/testing/selftests/bpf/config.vm" + "tools/testing/selftests/bpf/config.${ARCH}") INDEX_URL="https://raw.githubusercontent.com/libbpf/ci/master/INDEX" NUM_COMPILE_JOBS="$(nproc)" LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" From cf37d0affb2c2d471965d1139195bba6e75a0772 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Nov 2023 12:23:40 -1000 Subject: [PATCH 151/304] ravg: Drop scx_ prefix and implement ravg_transfer(). Misc rusty changes. 
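The intended usage pattern for the new ravg_transfer() helper is sketched below, assuming it sits in a BPF program that already includes ravg_impl.bpf.h the way rusty.bpf.c does; the wrapper name, the value arguments and the half-life parameter are illustrative, and the locking a real caller needs (rusty uses bpf_spin_lock) is omitted. The key constraint is that both aggregates and the component being moved must be accumulated to the same timestamp before the transfer:

/* Move one component's contribution from aggregate @src to aggregate @dst. */
static int ravg_move_component(struct ravg_data *src, u64 src_val,
			       struct ravg_data *dst, u64 dst_val,
			       struct ravg_data *comp, u64 comp_val,
			       u64 now, u64 half_life)
{
	int ret;

	/* bring all three running averages up to the same timestamp */
	ravg_accumulate(src, src_val, now, half_life);
	ravg_accumulate(dst, dst_val, now, half_life);
	ravg_accumulate(comp, comp_val, now, half_life);

	/* subtract the component from the source aggregate ... */
	ret = ravg_transfer(src, comp, false);
	if (ret)
		return ret;

	/* ... and add it to the destination aggregate */
	return ravg_transfer(dst, comp, true);
}

This is essentially the shape that dom_load_xfer_task() takes in the following patch, minus the spinlocks and debug output.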
--- .../sched_ext/{scx_ravg.bpf.h => ravg.bpf.h} | 0 .../{scx_ravg_impl.bpf.h => ravg_impl.bpf.h} | 42 ++++++++++++++++++- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 12 +++--- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 2 +- 4 files changed, 47 insertions(+), 9 deletions(-) rename tools/sched_ext/{scx_ravg.bpf.h => ravg.bpf.h} (100%) rename tools/sched_ext/{scx_ravg_impl.bpf.h => ravg_impl.bpf.h} (87%) diff --git a/tools/sched_ext/scx_ravg.bpf.h b/tools/sched_ext/ravg.bpf.h similarity index 100% rename from tools/sched_ext/scx_ravg.bpf.h rename to tools/sched_ext/ravg.bpf.h diff --git a/tools/sched_ext/scx_ravg_impl.bpf.h b/tools/sched_ext/ravg_impl.bpf.h similarity index 87% rename from tools/sched_ext/scx_ravg_impl.bpf.h rename to tools/sched_ext/ravg_impl.bpf.h index 245b0671e4386..3bdee68e0eb57 100644 --- a/tools/sched_ext/scx_ravg_impl.bpf.h +++ b/tools/sched_ext/ravg_impl.bpf.h @@ -1,5 +1,5 @@ /* to be included in the main bpf.c file */ -#include "scx_ravg.bpf.h" +#include "ravg.bpf.h" #define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) //#define RAVG_FN_ATTRS __attribute__((unused)) @@ -144,6 +144,43 @@ static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, rd->val_at = now; } +/** + * ravg_transfer - Transfer in or out a component running avg + * @base: ravg_data to transfer @xfer into or out of + * @xfer: ravg_data to transfer + * @is_xfer_in: transfer direction + * + * An ravg may be a sum of component ravgs. For example, a scheduling domain's + * load is the sum of the load values of all member tasks. If a task is migrated + * to a different domain, its contribution should be subtracted from the source + * ravg and added to the destination one. + * + * This function can be used for such component transfers. Both @base and @xfer + * must have been accumulated at the same timestamp. @xfer's contribution is + * subtracted if @is_fer_in is %false and added if %true. + */ +static RAVG_FN_ATTRS int ravg_transfer(struct ravg_data *base, struct ravg_data *xfer, + bool is_xfer_in) +{ + if (base->val_at != xfer->val_at) + return -EINVAL; + + if (is_xfer_in) { + base->old += xfer->old; + base->cur += xfer->cur; + } else { + if (base->old > xfer->old) + base->old -= xfer->old; + else + base->old = 0; + + if (base->cur > xfer->cur) + base->cur -= xfer->cur; + else + base->cur = 0; + } +} + /** * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) * @a: multiplicand @@ -154,7 +191,8 @@ static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must * ensure that the final shifted result fits in u64. */ -static __u64 u64_x_u32_rshift(__u64 a, __u32 b, __u32 rshift) +static inline __attribute__((always_inline)) +__u64 u64_x_u32_rshift(__u64 a, __u32 b, __u32 rshift) { const __u64 mask32 = (__u32)-1; __u64 al = a & mask32; diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 9cb5e8fd46200..0dac13604b8d2 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -36,7 +36,7 @@ * load balance based on userspace populating the lb_data map. 
*/ #include "../../../scx_common.bpf.h" -#include "../../../scx_ravg_impl.bpf.h" +#include "../../../ravg_impl.bpf.h" #include "rusty.h" #include @@ -114,7 +114,7 @@ struct { const u64 ravg_1 = 1 << RAVG_FRAC_BITS; -static void adj_dom_load(u32 dom_id, s64 adj, u64 now) +static void dom_load_adj(u32 dom_id, s64 adj, u64 now) { struct dom_load *load; @@ -308,7 +308,7 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, u64 now = bpf_ktime_get_ns(); if (taskc->runnable) - adj_dom_load(taskc->dom_id, -(s64)p->scx.weight, now); + dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta; taskc->dom_id = new_dom_id; @@ -316,7 +316,7 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, p->cpus_ptr); if (taskc->runnable) - adj_dom_load(taskc->dom_id, p->scx.weight, now); + dom_load_adj(taskc->dom_id, p->scx.weight, now); } return taskc->dom_id == new_dom_id; @@ -711,7 +711,7 @@ void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) taskc->runnable_at = bpf_ktime_get_ns(); taskc->is_kworker = p->flags & PF_WQ_WORKER; - adj_dom_load(taskc->dom_id, p->scx.weight, now); + dom_load_adj(taskc->dom_id, p->scx.weight, now); } void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) @@ -772,7 +772,7 @@ void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) taskc->runnable_for += now - taskc->runnable_at; taskc->runnable_at = 0; - adj_dom_load(taskc->dom_id, -(s64)p->scx.weight, now); + dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); } void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index d2efb06400d84..fc540e9261ec3 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -17,7 +17,7 @@ typedef unsigned char u8; typedef unsigned int u32; typedef unsigned long long u64; -#include "../../../scx_ravg.bpf.h" +#include "../../../ravg.bpf.h" #define MAX_CPUS 512 #define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ From 7dc6a8bc186779dde3696daab08f9268da97b549 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Nov 2023 13:12:23 -1000 Subject: [PATCH 152/304] scx_rusty: Implement task load transfers --- tools/sched_ext/ravg_impl.bpf.h | 18 ++++++ tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 56 ++++++++++++++++--- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 2 + 3 files changed, 69 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/ravg_impl.bpf.h b/tools/sched_ext/ravg_impl.bpf.h index 3bdee68e0eb57..e2a73a93aa06a 100644 --- a/tools/sched_ext/ravg_impl.bpf.h +++ b/tools/sched_ext/ravg_impl.bpf.h @@ -179,6 +179,8 @@ static RAVG_FN_ATTRS int ravg_transfer(struct ravg_data *base, struct ravg_data else base->cur = 0; } + + return 0; } /** @@ -223,6 +225,22 @@ __u64 u64_x_u32_rshift(__u64 a, __u32 b, __u32 rshift) return al + ah; } +/** + * ravg_scale - Scale a running avg + * @rd: ravg_data to scale + * @mult: multipler + * @rshift: right shift amount + * + * Scale @rd by multiplying the tracked values by @mult and shifting right by + * @rshift. 
+ */ +static RAVG_FN_ATTRS void ravg_scale(struct ravg_data *rd, u32 mult, u32 rshift) +{ + rd->val = u64_x_u32_rshift(rd->val, mult, rshift); + rd->old = u64_x_u32_rshift(rd->old, mult, rshift); + rd->cur = u64_x_u32_rshift(rd->cur, mult, rshift); +} + /** * ravg_read - Read the current running avg * @rd: ravg_data to read from diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 0dac13604b8d2..2261ee75f5233 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -101,7 +101,7 @@ struct { struct dom_load { struct bpf_spin_lock lock; u64 load; - struct ravg_data ravg_data; + struct ravg_data load_rd; }; struct { @@ -125,7 +125,7 @@ static void dom_load_adj(u32 dom_id, s64 adj, u64 now) bpf_spin_lock(&load->lock); load->load += adj; - ravg_accumulate(&load->ravg_data, load->load, now, USAGE_HALF_LIFE); + ravg_accumulate(&load->load_rd, load->load, now, USAGE_HALF_LIFE); bpf_spin_unlock(&load->lock); if (adj < 0 && (s64)load->load < 0) @@ -133,6 +133,50 @@ static void dom_load_adj(u32 dom_id, s64 adj, u64 now) bpf_get_smp_processor_id(), dom_id, load->load, adj); } +static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, + u32 from_dom_id, u32 to_dom_id, u64 now) +{ + struct dom_load *from_load, *to_load; + struct ravg_data task_load_rd; + int ret; + + from_load = bpf_map_lookup_elem(&dom_load, &from_dom_id); + to_load = bpf_map_lookup_elem(&dom_load, &to_dom_id); + if (!from_load || !to_load) { + scx_bpf_error("dom_load lookup failed"); + return; + } + + /* + * @p is moving from @from_dom_id to @to_dom_id. Its load contribution + * should be moved together. We only track duty cycle for tasks. Scale + * it by weight to get load_rd. 
+ */ + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); + task_load_rd = taskc->dcyc_rd; + ravg_scale(&task_load_rd, p->scx.weight, 0); + + /* transfer out of @from_dom_id */ + bpf_spin_lock(&from_load->lock); + if (taskc->runnable) + from_load->load -= p->scx.weight; + ravg_accumulate(&from_load->load_rd, from_load->load, now, USAGE_HALF_LIFE); + ret = ravg_transfer(&from_load->load_rd, &task_load_rd, false); + bpf_spin_unlock(&from_load->lock); + if (ret < 0) + scx_bpf_error("Failed to transfer out load"); + + /* transfer into @to_dom_id */ + bpf_spin_lock(&to_load->lock); + if (taskc->runnable) + to_load->load += p->scx.weight; + ravg_accumulate(&to_load->load_rd, to_load->load, now, USAGE_HALF_LIFE); + ret = ravg_transfer(&to_load->load_rd, &task_load_rd, true); + bpf_spin_unlock(&to_load->lock); + if (ret < 0) + scx_bpf_error("Failed to transfer in load"); +} + /* * Statistics */ @@ -307,16 +351,12 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, p->cpus_ptr)) { u64 now = bpf_ktime_get_ns(); - if (taskc->runnable) - dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); + dom_load_xfer_task(p, taskc, taskc->dom_id, new_dom_id, now); p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta; taskc->dom_id = new_dom_id; bpf_cpumask_and(t_cpumask, (const struct cpumask *)d_cpumask, p->cpus_ptr); - - if (taskc->runnable) - dom_load_adj(taskc->dom_id, p->scx.weight, now); } return taskc->dom_id == new_dom_id; @@ -711,6 +751,7 @@ void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) taskc->runnable_at = bpf_ktime_get_ns(); taskc->is_kworker = p->flags & PF_WQ_WORKER; + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); dom_load_adj(taskc->dom_id, p->scx.weight, now); } @@ -772,6 +813,7 @@ void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) taskc->runnable_for += now - taskc->runnable_at; taskc->runnable_at = 0; + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); } diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index fc540e9261ec3..e8b6e5cba0a0c 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -68,6 +68,8 @@ struct task_ctx { /* select_cpu() telling enqueue() to queue directly on the DSQ */ bool dispatch_local; + + struct ravg_data dcyc_rd; }; struct dom_ctx { From 298bec15b85a19c74f19761113442b6cb881c56d Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 2 Nov 2023 22:52:46 -0500 Subject: [PATCH 153/304] scx: Fix skel and .bpf.o Make deps With the recent Makefile refactor that puts all build artifacts into a build/ directory output, there was a regression in that Make would now always rebuild schedulers even if they were unchanged. This is happening because when Make looks at a target, it looks to see if that file exists. If it doesn't, it executes the target. There are a few targets that are improperly tracked: 1. We were taking a dependency on the sched.skel.h target (e.g. scx_simple.skel.h). In the old build system this was an actual file, but now it's just a target as the target name was never updated to point to the full path to the include file output. 2. The same goes for sched.bpf.o, which is a dependency of the skel file. 3. The scheduler itself, which now resides in build/bin. The first two we can fix by updating the targets to include the build directories. 
The latter we'll have to fix with some more complex Make magic, which we'll do in the subsequent commit. Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 107aa2613a751..a15e5ac35a52f 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -163,19 +163,20 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif -%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h \ +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h \ | $(BPFOBJ) $(SCXOBJ_DIR) - $(call msg,CLNG-BPF,,$@) - $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $(SCXOBJ_DIR)/$@ - -%.skel.h: %.bpf.o $(BPFTOOL) - $(call msg,GEN-SKEL,,$@) - $(Q)$(BPFTOOL) gen object $(SCXOBJ_DIR)/$(<:.o=.linked1.o) $(SCXOBJ_DIR)/$< - $(Q)$(BPFTOOL) gen object $(SCXOBJ_DIR)/$(<:.o=.linked2.o) $(SCXOBJ_DIR)/$(<:.o=.linked1.o) - $(Q)$(BPFTOOL) gen object $(SCXOBJ_DIR)/$(<:.o=.linked3.o) $(SCXOBJ_DIR)/$(<:.o=.linked2.o) - $(Q)diff $(SCXOBJ_DIR)/$(<:.o=.linked2.o) $(SCXOBJ_DIR)/$(<:.o=.linked3.o) - $(Q)$(BPFTOOL) gen skeleton $(SCXOBJ_DIR)/$(<:.o=.linked3.o) name $(<:.bpf.o=) > $(INCLUDE_DIR)/$@ - $(Q)$(BPFTOOL) gen subskeleton $(SCXOBJ_DIR)/$(<:.o=.linked3.o) name $(<:.bpf.o=) > $(INCLUDE_DIR)/$(@:.skel.h=.subskel.h) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) define ccsched $(CC) $(CFLAGS) -c $(1) -o $(SCXOBJ_DIR)/$(2).o @@ -183,7 +184,7 @@ define ccsched endef SCX_COMMON_DEPS := user_exit_info.h scx_user_common.h | $(BINDIR) -scx_simple: scx_simple.c scx_simple.skel.h $(SCX_COMMON_DEPS) +scx_simple: scx_simple.c $(INCLUDE_DIR)/scx_simple.skel.h $(SCX_COMMON_DEPS) $(call ccsched,$<,$@) scx_qmap: scx_qmap.c scx_qmap.skel.h $(SCX_COMMON_DEPS) From 62e23159251abb950eb51c3053def2637e4d9f1f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 3 Nov 2023 00:15:56 -0500 Subject: [PATCH 154/304] scx: Don't rebuild schedulers unnecessarily Now that the scheduler binaries are written to the build/bin/ directory, Make gets confused because it doesn't see the binary file in the same directory anymore and tries to rebuild it. This makes things kind of tricky, because make will always execute the recipe for the target, which is to compile it. We could add a layer of indirection by instead having the base scheduler target be empty, and just take a dependency on the actual binary that's created the compiler. This patch does that, and also cleans up the build to avoid copy-pasted scheduler recipes. 
Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index a15e5ac35a52f..5a7f88f07e5a7 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -178,29 +178,22 @@ $(INCLUDE_DIR)/%.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOO $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $@ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -define ccsched - $(CC) $(CFLAGS) -c $(1) -o $(SCXOBJ_DIR)/$(2).o - $(CC) -o $(BINDIR)/$(2) $(SCXOBJ_DIR)/$(2).o $(HOST_BPFOBJ) $(LDFLAGS) -endef - SCX_COMMON_DEPS := user_exit_info.h scx_user_common.h | $(BINDIR) -scx_simple: scx_simple.c $(INCLUDE_DIR)/scx_simple.skel.h $(SCX_COMMON_DEPS) - $(call ccsched,$<,$@) - -scx_qmap: scx_qmap.c scx_qmap.skel.h $(SCX_COMMON_DEPS) - $(call ccsched,$<,$@) -scx_central: scx_central.c scx_central.skel.h $(SCX_COMMON_DEPS) - $(call ccsched,$<,$@) +################ +# C schedulers # +################ +c-sched-targets = scx_qmap scx_simple scx_central scx_pair scx_flatcg scx_userland -scx_pair: scx_pair.c scx_pair.skel.h $(SCX_COMMON_DEPS) - $(call ccsched,$<,$@) - -scx_flatcg: scx_flatcg.c scx_flatcg.skel.h $(SCX_COMMON_DEPS) - $(call ccsched,$<,$@) - -scx_userland: scx_userland.c scx_userland.skel.h scx_userland.h $(SCX_COMMON_DEPS) - $(call ccsched,$<,$@) +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ + $(filter-out %.bpf.c,%.c) \ + $(INCLUDE_DIR)/%.skel.h \ + $(SCX_COMMON_DEPS) + $(eval sched=$(notdir $@)) + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) +$(c-sched-targets): %: $(BINDIR)/% # Separate build target that is available for build systems to use to fetch # dependencies in a separate step from building. This allows the scheduler @@ -227,7 +220,7 @@ clean: cargo clean --manifest-path=scx_rusty/Cargo.toml rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h - rm -f scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland + rm -f $(c-sched-targets) fullclean: clean $(Q)$(MAKE) -sC ../../ clean @@ -291,7 +284,7 @@ help: @echo ' rust files for rust schedulers, and also trigger a' @echo ' clean of the kernel at the root of the whole repository.' -.PHONY: all scx_rusty clean fullclean help +.PHONY: all $(c-sched-targets) scx_rusty clean fullclean help # delete failed targets .DELETE_ON_ERROR: From 2c768434c332d376e57281ab5b8ce6f38689e963 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 3 Nov 2023 00:44:20 -0500 Subject: [PATCH 155/304] scx: Aggregate build logic for rust schedulers scx_rusty currently defines several build targets and recipes that would have to be duplicated by any other rust scheduler we may add. Let's add some build scaffolding to avoid people having to copy paste. Note that we can't fully avoid running any make logic if we take the same approach as with the C schedulers. The C schedulers add a layer of indirection where the "base" target (e.g. scx_simple) do nothing but take a dependency on the binary output file. This doesn't work with rust schedulers, because we're relying on Cargo to tell us when it needs to be rebuilt. 
Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 35 ++++++++++++++++++------------ tools/sched_ext/scx_rusty/build.rs | 4 ++-- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 5a7f88f07e5a7..3865ae042f8bc 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -195,29 +195,36 @@ $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) $(c-sched-targets): %: $(BINDIR)/% + +################### +# Rust schedulers # +################### +rust-sched-targets := scx_rusty + # Separate build target that is available for build systems to use to fetch # dependencies in a separate step from building. This allows the scheduler # to be compiled without network access. # -# If the scx_rusty Make target is invoked without CARGO_OFFLINE=1 (e.g. if -# building locally), then cargo build will download all of the necessary -# dependencies, and scx_rusty_deps can be skipped. -scx_rusty_deps: - cargo fetch --manifest-path=scx_rusty/Cargo.toml - -scx_rusty: export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR) -scx_rusty: export SCX_RUSTY_CLANG = $(CLANG) -scx_rusty: export SCX_RUSTY_BPF_CFLAGS = $(BPF_CFLAGS) -scx_rusty: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) - cargo build --manifest-path=$@/Cargo.toml $(CARGOFLAGS) - $(Q)cp $(OUTPUT_DIR)/release/$@ $(BINDIR)/$@ +# If the regular rust scheduler Make target (e.g. scx_rusty) is invoked without +# CARGO_OFFLINE=1 (e.g. if building locally), then cargo build will download +# all of the necessary dependencies, and the deps target can be skipped. +$(addsuffix _deps,$(rust-sched-targets)): + $(Q)cargo fetch --manifest-path=scx_rusty/Cargo.toml + +$(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) + $(eval export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR)) + $(eval export SCX_RUST_CLANG = $(CLANG)) + $(eval export SCX_RUST_BPF_CFLAGS= $(BPF_CFLAGS)) + $(eval sched=$(notdir $@)) + $(Q)cargo build --manifest-path=$(sched)/Cargo.toml $(CARGOFLAGS) + $(Q)cp $(OUTPUT_DIR)/release/$(sched) $(BINDIR)/$@ install: all $(Q)mkdir -p $(DESTDIR)/usr/bin/ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/bin/ clean: - cargo clean --manifest-path=scx_rusty/Cargo.toml + $(foreach sched,$(rust-sched-targets),cargo clean --manifest-path=$(sched)/Cargo.toml;) rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f $(c-sched-targets) @@ -284,7 +291,7 @@ help: @echo ' rust files for rust schedulers, and also trigger a' @echo ' clean of the kernel at the root of the whole repository.' 
-.PHONY: all $(c-sched-targets) scx_rusty clean fullclean help +.PHONY: all $(c-sched-targets) $(rust-sched-targets) clean fullclean help # delete failed targets .DELETE_ON_ERROR: diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index 2385e7e6f040f..c54b8f33c5778 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -40,8 +40,8 @@ fn bindgen_rusty() { } fn gen_bpf_sched(name: &str) { - let bpf_cflags = env::var("SCX_RUSTY_BPF_CFLAGS").unwrap(); - let clang = env::var("SCX_RUSTY_CLANG").unwrap(); + let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); + let clang = env::var("SCX_RUST_CLANG").unwrap(); eprintln!("{}", clang); let outpath = format!("./src/bpf/.output/{}.skel.rs", name); let skel = Path::new(&outpath); From b94df28c9baedf267d432e14f85eeb9c32c6fa66 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Fri, 3 Nov 2023 09:11:26 +0100 Subject: [PATCH 156/304] bpftool: Fix prog object type in manpage bpftool's man page lists "program" as one of possible values for OBJECT, while in fact bpftool accepts "prog" instead. Reported-by: Jerry Snitselaar Signed-off-by: Artem Savkov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20231103081126.170034-1-asavkov@redhat.com --- tools/bpf/bpftool/Documentation/bpftool.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 6965c94dfdafe..09e4f2ff5658b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -20,7 +20,7 @@ SYNOPSIS **bpftool** **version** - *OBJECT* := { **map** | **program** | **link** | **cgroup** | **perf** | **net** | **feature** | + *OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | **btf** | **gen** | **struct_ops** | **iter** } *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } From 58e2a66edb3f9c87abc236853fa94523177792cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2023 09:12:06 -1000 Subject: [PATCH 157/304] scx_common: libbpf now has inline iter decls, drop them from scx_common.bpf.h --- tools/sched_ext/scx_common.bpf.h | 104 ------------------------------- 1 file changed, 104 deletions(-) diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 81bfe3d041c9a..38168981fd0b7 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -235,108 +235,4 @@ u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; -/* BPF core iterators from tools/testing/selftests/bpf/progs/bpf_misc.h */ -struct bpf_iter_num; - -extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym; -extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym; -extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym; - -#ifndef bpf_for_each -/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for - * using BPF open-coded iterators without having to write mundane explicit - * low-level loop logic. Instead, it provides for()-like generic construct - * that can be used pretty naturally. 
E.g., for some hypothetical cgroup - * iterator, you'd write: - * - * struct cgroup *cg, *parent_cg = <...>; - * - * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) { - * bpf_printk("Child cgroup id = %d", cg->cgroup_id); - * if (cg->cgroup_id == 123) - * break; - * } - * - * I.e., it looks almost like high-level for each loop in other languages, - * supports continue/break, and is verifiable by BPF verifier. - * - * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) - * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to - * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int - * *`, not just `int`. So for integers bpf_for() is more convenient. - * - * Note: this macro relies on C99 feature of allowing to declare variables - * inside for() loop, bound to for() loop lifetime. It also utilizes GCC - * extension: __attribute__((cleanup())), supported by both GCC and - * Clang. - */ -#define bpf_for_each(type, cur, args...) for ( \ - /* initialize and define destructor */ \ - struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */, \ - cleanup(bpf_iter_##type##_destroy))), \ - /* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */ \ - *___p __attribute__((unused)) = ( \ - bpf_iter_##type##_new(&___it, ##args), \ - /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ - /* for bpf_iter_##type##_destroy() when used from cleanup() attribute */ \ - (void)bpf_iter_##type##_destroy, (void *)0); \ - /* iteration and termination check */ \ - (((cur) = bpf_iter_##type##_next(&___it))); \ -) -#endif /* bpf_for_each */ - -#ifndef bpf_for -/* bpf_for(i, start, end) implements a for()-like looping construct that sets - * provided integer variable *i* to values starting from *start* through, - * but not including, *end*. It also proves to BPF verifier that *i* belongs - * to range [start, end), so this can be used for accessing arrays without - * extra checks. - * - * Note: *start* and *end* are assumed to be expressions with no side effects - * and whose values do not change throughout bpf_for() loop execution. They do - * not have to be statically known or constant, though. - * - * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() - * loop bound variables and cleanup attribute, supported by GCC and Clang. - */ -#define bpf_for(i, start, end) for ( \ - /* initialize and define destructor */ \ - struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ - cleanup(bpf_iter_num_destroy))), \ - /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ - *___p __attribute__((unused)) = ( \ - bpf_iter_num_new(&___it, (start), (end)), \ - /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ - /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ - (void)bpf_iter_num_destroy, (void *)0); \ - ({ \ - /* iteration step */ \ - int *___t = bpf_iter_num_next(&___it); \ - /* termination and bounds check */ \ - (___t && ((i) = *___t, (i) >= (start) && (i) < (end))); \ - }); \ -) -#endif /* bpf_for */ - -#ifndef bpf_repeat -/* bpf_repeat(N) performs N iterations without exposing iteration number - * - * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() - * loop bound variables and cleanup attribute, supported by GCC and Clang. 
- */ -#define bpf_repeat(N) for ( \ - /* initialize and define destructor */ \ - struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ - cleanup(bpf_iter_num_destroy))), \ - /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ - *___p __attribute__((unused)) = ( \ - bpf_iter_num_new(&___it, 0, (N)), \ - /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ - /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ - (void)bpf_iter_num_destroy, (void *)0); \ - bpf_iter_num_next(&___it); \ - /* nothing here */ \ -) -#endif /* bpf_repeat */ - #endif /* __SCHED_EXT_COMMON_BPF_H */ From a4fbd6fc5a4d4574907de9c4f87be6e0be1f90c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2023 08:55:31 -1000 Subject: [PATCH 158/304] scx_rusty: ravg WIP --- tools/sched_ext/ravg_impl.bpf.h | 24 ++-- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 83 +++++++------ tools/sched_ext/scx_rusty/src/bpf/rusty.h | 5 +- tools/sched_ext/scx_rusty/src/main.rs | 117 ++++++++++++++++-- 4 files changed, 174 insertions(+), 55 deletions(-) diff --git a/tools/sched_ext/ravg_impl.bpf.h b/tools/sched_ext/ravg_impl.bpf.h index e2a73a93aa06a..3b254855220be 100644 --- a/tools/sched_ext/ravg_impl.bpf.h +++ b/tools/sched_ext/ravg_impl.bpf.h @@ -121,15 +121,13 @@ static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, if (seq_delta > 1) { __u32 idx = seq_delta - 2; - if (idx < ravg_full_sum_len) - ravg_add(&rd->old, rd->val * - ravg_full_sum[idx]); - else - ravg_add(&rd->old, rd->val * - ravg_full_sum[ravg_full_sum_len - 2]); + if (idx >= ravg_full_sum_len) + idx = ravg_full_sum_len - 1; + + ravg_add(&rd->old, rd->val * ravg_full_sum[idx]); } - /* accumulate the current period duration into ->runtime */ + /* accumulate the current period duration into ->cur */ rd->cur += rd->val * ravg_normalize_dur(now % half_life, half_life); } else { @@ -253,7 +251,17 @@ static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, __u64 half_life) { struct ravg_data trd; - __u32 elapsed = now % half_life; + __u32 elapsed; + + /* + * It may be difficult for the caller to guarantee monotonic progress if + * multiple CPUs accumulate to the same ravg_data. Handle @now being in + * the past of @rd->val_at. + */ + if (now < rd->val_at) + now = rd->val_at; + + elapsed = now % half_life; /* * Accumulate the ongoing period into a temporary copy. 
This allows diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 2261ee75f5233..f7967f35ef6b9 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -96,54 +96,59 @@ struct { __type(value, struct dom_ctx); __uint(max_entries, MAX_DOMS); __uint(map_flags, 0); -} dom_ctx SEC(".maps"); +} dom_data SEC(".maps"); -struct dom_load { +struct lock_wrapper { struct bpf_spin_lock lock; - u64 load; - struct ravg_data load_rd; }; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); - __type(value, struct dom_load); + __type(value, struct lock_wrapper); __uint(max_entries, MAX_DOMS); __uint(map_flags, 0); -} dom_load SEC(".maps"); +} dom_load_locks SEC(".maps"); const u64 ravg_1 = 1 << RAVG_FRAC_BITS; static void dom_load_adj(u32 dom_id, s64 adj, u64 now) { - struct dom_load *load; + struct dom_ctx *domc; + struct lock_wrapper *lockw; + + domc = bpf_map_lookup_elem(&dom_data, &dom_id); + lockw = bpf_map_lookup_elem(&dom_load_locks, &dom_id); - if (!(load = bpf_map_lookup_elem(&dom_load, &dom_id))) { - scx_bpf_error("no dom_load for dom %u", dom_id); + if (!domc || !lockw) { + scx_bpf_error("dom_ctx / lock lookup failed"); return; } - bpf_spin_lock(&load->lock); - load->load += adj; - ravg_accumulate(&load->load_rd, load->load, now, USAGE_HALF_LIFE); - bpf_spin_unlock(&load->lock); + bpf_spin_lock(&lockw->lock); + domc->load += adj; + ravg_accumulate(&domc->load_rd, domc->load, now, USAGE_HALF_LIFE); + bpf_spin_unlock(&lockw->lock); - if (adj < 0 && (s64)load->load < 0) + if (adj < 0 && (s64)domc->load < 0) scx_bpf_error("cpu%d dom%u load underflow (load=%lld adj=%lld)", - bpf_get_smp_processor_id(), dom_id, load->load, adj); + bpf_get_smp_processor_id(), dom_id, domc->load, adj); } static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, u32 from_dom_id, u32 to_dom_id, u64 now) { - struct dom_load *from_load, *to_load; + struct dom_ctx *from_domc, *to_domc; + struct lock_wrapper *from_lockw, *to_lockw; struct ravg_data task_load_rd; int ret; - from_load = bpf_map_lookup_elem(&dom_load, &from_dom_id); - to_load = bpf_map_lookup_elem(&dom_load, &to_dom_id); - if (!from_load || !to_load) { - scx_bpf_error("dom_load lookup failed"); + from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id); + from_lockw = bpf_map_lookup_elem(&dom_load_locks, &from_dom_id); + to_domc = bpf_map_lookup_elem(&dom_data, &to_dom_id); + to_lockw = bpf_map_lookup_elem(&dom_load_locks, &to_dom_id); + if (!from_domc || !from_lockw || !to_domc || !to_lockw) { + scx_bpf_error("dom_ctx / lock lookup failed"); return; } @@ -157,22 +162,22 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, ravg_scale(&task_load_rd, p->scx.weight, 0); /* transfer out of @from_dom_id */ - bpf_spin_lock(&from_load->lock); + bpf_spin_lock(&from_lockw->lock); if (taskc->runnable) - from_load->load -= p->scx.weight; - ravg_accumulate(&from_load->load_rd, from_load->load, now, USAGE_HALF_LIFE); - ret = ravg_transfer(&from_load->load_rd, &task_load_rd, false); - bpf_spin_unlock(&from_load->lock); + from_domc->load -= p->scx.weight; + ravg_accumulate(&from_domc->load_rd, from_domc->load, now, USAGE_HALF_LIFE); + ret = ravg_transfer(&from_domc->load_rd, &task_load_rd, false); + bpf_spin_unlock(&from_lockw->lock); if (ret < 0) scx_bpf_error("Failed to transfer out load"); /* transfer into @to_dom_id */ - bpf_spin_lock(&to_load->lock); + bpf_spin_lock(&to_lockw->lock); if (taskc->runnable) 
- to_load->load += p->scx.weight; - ravg_accumulate(&to_load->load_rd, to_load->load, now, USAGE_HALF_LIFE); - ret = ravg_transfer(&to_load->load_rd, &task_load_rd, true); - bpf_spin_unlock(&to_load->lock); + to_domc->load += p->scx.weight; + ravg_accumulate(&to_domc->load_rd, to_domc->load, now, USAGE_HALF_LIFE); + ret = ravg_transfer(&to_domc->load_rd, &task_load_rd, true); + bpf_spin_unlock(&to_lockw->lock); if (ret < 0) scx_bpf_error("Failed to transfer in load"); } @@ -278,7 +283,7 @@ static void refresh_tune_params(void) u32 dom_id = cpu_to_dom_id(cpu); struct dom_ctx *domc; - if (!(domc = bpf_map_lookup_elem(&dom_ctx, &dom_id))) { + if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { scx_bpf_error("Failed to lookup dom[%u]", dom_id); return; } @@ -313,7 +318,7 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, u32 old_dom_id = taskc->dom_id; s64 vtime_delta; - old_domc = bpf_map_lookup_elem(&dom_ctx, &old_dom_id); + old_domc = bpf_map_lookup_elem(&dom_data, &old_dom_id); if (!old_domc) { scx_bpf_error("Failed to lookup old dom%u", old_dom_id); return false; @@ -324,7 +329,7 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, else vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now; - new_domc = bpf_map_lookup_elem(&dom_ctx, &new_dom_id); + new_domc = bpf_map_lookup_elem(&dom_data, &new_dom_id); if (!new_domc) { scx_bpf_error("Failed to lookup new dom%u", new_dom_id); return false; @@ -397,7 +402,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, const struct cpumask *idle_cpumask; bool has_idle; - domc = bpf_map_lookup_elem(&dom_ctx, &taskc->dom_id); + domc = bpf_map_lookup_elem(&dom_data, &taskc->dom_id); if (!domc) { scx_bpf_error("Failed to find dom%u", taskc->dom_id); goto enoent; @@ -511,7 +516,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, u32 dom_id = cpu_to_dom_id(prev_cpu); struct dom_ctx *domc; - if (!(domc = bpf_map_lookup_elem(&dom_ctx, &dom_id))) { + if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { scx_bpf_error("Failed to lookup dom[%u]", dom_id); goto enoent; } @@ -642,7 +647,7 @@ void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) u32 dom_id = taskc->dom_id; struct dom_ctx *domc; - domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); + domc = bpf_map_lookup_elem(&dom_data, &dom_id); if (!domc) { scx_bpf_error("Failed to lookup dom[%u]", dom_id); return; @@ -770,7 +775,7 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) taskc->running_at = bpf_ktime_get_ns(); dom_id = taskc->dom_id; - domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); + domc = bpf_map_lookup_elem(&dom_data, &dom_id); if (!domc) { scx_bpf_error("Failed to lookup dom[%u]", dom_id); return; @@ -952,13 +957,13 @@ static s32 create_dom(u32 dom_id) return ret; } - ret = bpf_map_update_elem(&dom_ctx, &dom_id, &domc_init, 0); + ret = bpf_map_update_elem(&dom_data, &dom_id, &domc_init, 0); if (ret) { scx_bpf_error("Failed to add dom_ctx entry %u (%d)", dom_id, ret); return ret; } - domc = bpf_map_lookup_elem(&dom_ctx, &dom_id); + domc = bpf_map_lookup_elem(&dom_data, &dom_id); if (!domc) { /* Should never happen, we just inserted it above. 
*/ scx_bpf_error("No dom%u", dom_id); diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index e8b6e5cba0a0c..a70e2488f78c4 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -73,9 +73,12 @@ struct task_ctx { }; struct dom_ctx { + u64 vtime_now; struct bpf_cpumask __kptr *cpumask; struct bpf_cpumask __kptr *direct_greedy_cpumask; - u64 vtime_now; + + u64 load; + struct ravg_data load_rd; }; #endif /* __RUSTY_H */ diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 4ff57913aa1d7..0de9f0277af28 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -134,6 +134,77 @@ struct Opts { verbose: u8, } +fn ravg_read(rd: &rusty_sys::ravg_data, now: u64, half_life: u64) -> f64 { + const RAVG_1: f64 = (1 << rusty_sys::ravg_consts_RAVG_FRAC_BITS) as f64; + let val = rd.val as f64 / RAVG_1; + let val_at = rd.val_at; + let mut old = rd.old as f64 / RAVG_1; + let mut cur = rd.cur as f64 / RAVG_1; + + let now = now.max(val_at); + let normalized_dur = |dur| dur as f64 / half_life as f64; + + // + // The following is f64 implementation of BPF ravg_accumulate(). + // + let cur_seq = (now / half_life) as i64; + let val_seq = (val_at / half_life) as i64; + let seq_delta = (cur_seq - val_seq) as i32; + + if seq_delta > 0 { + let full_decay = 2f64.powi(seq_delta); + + // Decay $old and fold $cur into it. + old /= full_decay; + old += cur / full_decay; + cur = 0.0; + + // Fold the oldest period whicy may be partial. + old += val * normalized_dur(half_life - val_at % half_life) / full_decay; + + /* pre-computed decayed full-period values */ + const FULL_SUMS: [f64; 20] = [ + 0.5, + 0.75, + 0.875, + 0.9375, + 0.96875, + 0.984375, + 0.9921875, + 0.99609375, + 0.998046875, + 0.9990234375, + 0.99951171875, + 0.999755859375, + 0.9998779296875, + 0.99993896484375, + 0.999969482421875, + 0.9999847412109375, + 0.9999923706054688, + 0.9999961853027344, + 0.9999980926513672, + 0.9999990463256836, + /* use the same value beyond this point */ + ]; + + // Fold the full periods in the middle. + if seq_delta >= 2 { + let idx = ((seq_delta - 2) as usize).min(FULL_SUMS.len() - 1); + old += val * FULL_SUMS[idx]; + } + + // Accumulate the current period duration into @cur. + cur += val * normalized_dur(now % half_life); + } else { + cur += val * normalized_dur(now - val_at); + } + + // + // The following is the blending part of BPF ravg_read(). + // + old * (1.0 - normalized_dur(now % half_life) / 2.0) + cur / 2.0 +} + fn now_monotonic() -> u64 { let mut time = libc::timespec { tv_sec: 0, @@ -348,8 +419,7 @@ impl Topology { } // Build and return dom -> cpumask and cpu -> dom mappings. - let mut dom_cpus = - vec![bitvec![u64, Lsb0; 0; rusty_sys::MAX_CPUS as usize]; nr_doms]; + let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; rusty_sys::MAX_CPUS as usize]; nr_doms]; let mut cpu_dom = vec![]; for (cpu, cache) in cpu_to_cache.iter().enumerate().take(nr_cpus) { @@ -633,6 +703,32 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { Ok(()) } + fn read_dom_loads(&mut self) -> Result<()> { + let now_mono = now_monotonic(); + let dom_data = self.maps.dom_data(); + + for i in 0..self.top.nr_doms { + let key = unsafe { std::mem::transmute::(i as u32) }; + + if let Some(dom_ctx_map_elem) = dom_data + .lookup(&key, libbpf_rs::MapFlags::ANY) + .context("Failed to lookup dom_ctx")? 
+ { + let dom_ctx = unsafe { + &*(dom_ctx_map_elem.as_slice().as_ptr() as *const rusty_sys::dom_ctx) + }; + + self.dom_loads[i] = ravg_read( + &dom_ctx.load_rd, + now_mono, + rusty_sys::USAGE_HALF_LIFE as u64, + ); + } + } + + Ok(()) + } + // To balance dom loads we identify doms with lower and higher load than average fn calculate_dom_load_balance(&mut self) -> Result<()> { for (dom, dom_load) in self.dom_loads.iter().enumerate() { @@ -1012,11 +1108,7 @@ impl<'a> Scheduler<'a> { }) .sum(); stats_map - .update_percpu( - &stat.to_ne_bytes(), - &zero_vec, - libbpf_rs::MapFlags::ANY, - ) + .update_percpu(&stat.to_ne_bytes(), &zero_vec, libbpf_rs::MapFlags::ANY) .context("Failed to zero stat")?; stats.push(sum); } @@ -1119,6 +1211,17 @@ impl<'a> Scheduler<'a> { ); lb.read_task_loads(started_at.duration_since(self.prev_at))?; + + let dom_loads_from_task_loads = lb.dom_loads.clone(); + lb.dom_loads = vec![0f64; self.top.nr_doms]; + lb.read_dom_loads()?; + for i in 0..self.top.nr_doms { + info!( + "dom{} = {:.2} {:.2}", + i, dom_loads_from_task_loads[i], lb.dom_loads[i] + ); + } + lb.calculate_dom_load_balance()?; if self.balance_load { From b24bc9b40cb293c95e32a5d8c56ffc3a048ff659 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2023 12:52:06 -1000 Subject: [PATCH 159/304] scx_rusty: Switch to ravg dom loads --- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 39 +++++++++++++++++++ tools/sched_ext/scx_rusty/src/bpf/rusty.h | 1 + tools/sched_ext/scx_rusty/src/main.rs | 27 +++++-------- 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index f7967f35ef6b9..84b8f2d1da1d4 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -65,6 +65,7 @@ const volatile bool kthreads_local; const volatile bool fifo_sched; const volatile bool switch_partial; const volatile u32 greedy_threshold; +const volatile u32 debug; /* base slice duration */ const volatile u64 slice_ns = SCX_SLICE_DFL; @@ -133,6 +134,15 @@ static void dom_load_adj(u32 dom_id, s64 adj, u64 now) if (adj < 0 && (s64)domc->load < 0) scx_bpf_error("cpu%d dom%u load underflow (load=%lld adj=%lld)", bpf_get_smp_processor_id(), dom_id, domc->load, adj); + + if (debug >=2 && + (!domc->dbg_load_printed_at || now - domc->dbg_load_printed_at >= 1000000000)) { + bpf_printk("LOAD ADJ dom=%u adj=%lld load=%llu", + dom_id, + adj, + ravg_read(&domc->load_rd, now, USAGE_HALF_LIFE) >> RAVG_FRAC_BITS); + domc->dbg_load_printed_at = now; + } } static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, @@ -141,6 +151,7 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, struct dom_ctx *from_domc, *to_domc; struct lock_wrapper *from_lockw, *to_lockw; struct ravg_data task_load_rd; + u64 from_load[2], to_load[2], task_load; int ret; from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id); @@ -161,12 +172,23 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, task_load_rd = taskc->dcyc_rd; ravg_scale(&task_load_rd, p->scx.weight, 0); + if (debug >= 2) + task_load = ravg_read(&task_load_rd, now, USAGE_HALF_LIFE); + /* transfer out of @from_dom_id */ bpf_spin_lock(&from_lockw->lock); if (taskc->runnable) from_domc->load -= p->scx.weight; ravg_accumulate(&from_domc->load_rd, from_domc->load, now, USAGE_HALF_LIFE); + + if (debug >= 2) + from_load[0] = ravg_read(&from_domc->load_rd, now, USAGE_HALF_LIFE); + ret = 
ravg_transfer(&from_domc->load_rd, &task_load_rd, false); + + if (debug >= 2) + from_load[1] = ravg_read(&from_domc->load_rd, now, USAGE_HALF_LIFE); + bpf_spin_unlock(&from_lockw->lock); if (ret < 0) scx_bpf_error("Failed to transfer out load"); @@ -176,10 +198,27 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, if (taskc->runnable) to_domc->load += p->scx.weight; ravg_accumulate(&to_domc->load_rd, to_domc->load, now, USAGE_HALF_LIFE); + + if (debug >= 2) + to_load[0] = ravg_read(&to_domc->load_rd, now, USAGE_HALF_LIFE); + ret = ravg_transfer(&to_domc->load_rd, &task_load_rd, true); + + if (debug >= 2) + to_load[1] = ravg_read(&to_domc->load_rd, now, USAGE_HALF_LIFE); + bpf_spin_unlock(&to_lockw->lock); if (ret < 0) scx_bpf_error("Failed to transfer in load"); + + if (debug >= 2) + bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu", + from_dom_id, to_dom_id, + task_load >> RAVG_FRAC_BITS, + from_load[0] >> RAVG_FRAC_BITS, + from_load[1] >> RAVG_FRAC_BITS, + to_load[0] >> RAVG_FRAC_BITS, + to_load[1] >> RAVG_FRAC_BITS); } /* diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index a70e2488f78c4..ace3fb8c31c43 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -79,6 +79,7 @@ struct dom_ctx { u64 load; struct ravg_data load_rd; + u64 dbg_load_printed_at; }; #endif /* __RUSTY_H */ diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 0de9f0277af28..3e314a24549a1 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -136,7 +136,7 @@ struct Opts { fn ravg_read(rd: &rusty_sys::ravg_data, now: u64, half_life: u64) -> f64 { const RAVG_1: f64 = (1 << rusty_sys::ravg_consts_RAVG_FRAC_BITS) as f64; - let val = rd.val as f64 / RAVG_1; + let val = rd.val as f64; let val_at = rd.val_at; let mut old = rd.old as f64 / RAVG_1; let mut cur = rd.cur as f64 / RAVG_1; @@ -621,8 +621,6 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { let now_mono = now_monotonic(); let task_data = self.maps.task_data(); let mut this_task_loads = BTreeMap::::new(); - let mut load_sum = 0.0f64; - self.dom_loads = vec![0f64; self.top.nr_doms]; for key in task_data.keys() { if let Some(task_ctx_vec) = task_data @@ -680,8 +678,6 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { }, ); - load_sum += this_load; - self.dom_loads[task_ctx.dom_id as usize] += this_load; // Only record pids that are eligible for load balancing if task_ctx.dom_mask == (1u64 << task_ctx.dom_id) { continue; @@ -698,7 +694,6 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { } } - self.load_avg = load_sum / self.top.nr_doms as f64; *self.task_loads = this_task_loads; Ok(()) } @@ -706,9 +701,10 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { fn read_dom_loads(&mut self) -> Result<()> { let now_mono = now_monotonic(); let dom_data = self.maps.dom_data(); + let mut load_sum = 0.0f64; for i in 0..self.top.nr_doms { - let key = unsafe { std::mem::transmute::(i as u32) }; + let key = unsafe { std::mem::transmute::(i as u32) }; if let Some(dom_ctx_map_elem) = dom_data .lookup(&key, libbpf_rs::MapFlags::ANY) @@ -723,9 +719,13 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { now_mono, rusty_sys::USAGE_HALF_LIFE as u64, ); + + load_sum += self.dom_loads[i]; } } + self.load_avg = load_sum / self.top.nr_doms as f64; + Ok(()) } @@ -995,6 +995,7 @@ impl<'a> Scheduler<'a> { skel.rodata().fifo_sched = opts.fifo_sched; skel.rodata().switch_partial = 
opts.partial; skel.rodata().greedy_threshold = opts.greedy_threshold; + skel.rodata().debug = opts.verbose as u32; // Attach. let mut skel = skel.load().context("Failed to load BPF program")?; @@ -1210,18 +1211,8 @@ impl<'a> Scheduler<'a> { &mut self.nr_lb_data_errors, ); - lb.read_task_loads(started_at.duration_since(self.prev_at))?; - - let dom_loads_from_task_loads = lb.dom_loads.clone(); - lb.dom_loads = vec![0f64; self.top.nr_doms]; lb.read_dom_loads()?; - for i in 0..self.top.nr_doms { - info!( - "dom{} = {:.2} {:.2}", - i, dom_loads_from_task_loads[i], lb.dom_loads[i] - ); - } - + lb.read_task_loads(started_at.duration_since(self.prev_at))?; lb.calculate_dom_load_balance()?; if self.balance_load { From d401cf14ae608966ca469f44fc369025a32b02a9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2023 12:56:21 -1000 Subject: [PATCH 160/304] scx_rusty: Drop unnnecessary read_volatiles from map elem access --- tools/sched_ext/scx_rusty/src/main.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 3e314a24549a1..4bf49c9c2bc3c 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -635,13 +635,8 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { .context("Invalid key length in task_data map")?, ); - let (this_at, this_for, weight) = unsafe { - ( - std::ptr::read_volatile(&task_ctx.runnable_at as *const u64), - std::ptr::read_volatile(&task_ctx.runnable_for as *const u64), - std::ptr::read_volatile(&task_ctx.weight as *const u32), - ) - }; + let (this_at, this_for, weight) = + (task_ctx.runnable_at, task_ctx.runnable_for, task_ctx.weight); let (mut delta, prev_load) = match self.task_loads.get(&pid) { Some(prev) => (this_for - prev.runnable_for, Some(prev.load)), @@ -701,7 +696,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { fn read_dom_loads(&mut self) -> Result<()> { let now_mono = now_monotonic(); let dom_data = self.maps.dom_data(); - let mut load_sum = 0.0f64; + let mut load_sum = 0.0f64; for i in 0..self.top.nr_doms { let key = unsafe { std::mem::transmute::(i as u32) }; @@ -720,7 +715,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { rusty_sys::USAGE_HALF_LIFE as u64, ); - load_sum += self.dom_loads[i]; + load_sum += self.dom_loads[i]; } } From fbf0ccfe0b6bc379319cf43639891b373eec76d0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2023 13:10:07 -1000 Subject: [PATCH 161/304] scx_rusty: Elide reading task loads if LB isn't necessary --- tools/sched_ext/scx_rusty/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 4bf49c9c2bc3c..f2076b20674a1 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -1207,10 +1207,10 @@ impl<'a> Scheduler<'a> { ); lb.read_dom_loads()?; - lb.read_task_loads(started_at.duration_since(self.prev_at))?; lb.calculate_dom_load_balance()?; - if self.balance_load { + if self.balance_load && lb.doms_to_push.len() > 0 { + lb.read_task_loads(started_at.duration_since(self.prev_at))?; lb.load_balance()?; } From 8895ddd02fbf346207e5337abbe17a2b7068e3a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2023 23:36:30 -1000 Subject: [PATCH 162/304] rusty: Fully switch to ravg --- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 52 ++++- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 4 +- tools/sched_ext/scx_rusty/src/main.rs | 208 
++++++++---------- 3 files changed, 130 insertions(+), 134 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 84b8f2d1da1d4..9e28d9d9bf13d 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -111,6 +111,15 @@ struct { __uint(map_flags, 0); } dom_load_locks SEC(".maps"); +struct dom_active_pids { + u64 gen; + u64 read_idx; + u64 write_idx; + s32 pids[MAX_DOM_ACTIVE_PIDS]; +}; + +struct dom_active_pids dom_active_pids[MAX_DOMS]; + const u64 ravg_1 = 1 << RAVG_FRAC_BITS; static void dom_load_adj(u32 dom_id, s64 adj, u64 now) @@ -792,7 +801,6 @@ void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) return; taskc->runnable = true; - taskc->runnable_at = bpf_ktime_get_ns(); taskc->is_kworker = p->flags & PF_WQ_WORKER; ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); @@ -803,17 +811,42 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) { struct task_ctx *taskc; struct dom_ctx *domc; - u32 dom_id; - - if (fifo_sched) - return; + u32 dom_id, dap_gen; if (!(taskc = lookup_task_ctx(p))) return; taskc->running_at = bpf_ktime_get_ns(); - dom_id = taskc->dom_id; + if (dom_id >= MAX_DOMS) { + scx_bpf_error("Invalid dom ID"); + return; + } + + /* + * Record that @p has been active in @domc. Load balancer will only + * consider recently active tasks. Access synchronization rules aren't + * strict. We just need to be right most of the time. + */ + dap_gen = dom_active_pids[dom_id].gen; + if (taskc->dom_active_pids_gen != dap_gen) { + u64 idx = __sync_fetch_and_add(&dom_active_pids[dom_id].write_idx, 1) % + MAX_DOM_ACTIVE_PIDS; + u32 *pidp; + + pidp = MEMBER_VPTR(dom_active_pids, [dom_id].pids[idx]); + if (!pidp) { + scx_bpf_error("dom_active_pids[%u][%u] indexing failed", dom_id, idx); + return; + } + + *pidp = p->pid; + taskc->dom_active_pids_gen = dap_gen; + } + + if (fifo_sched) + return; + domc = bpf_map_lookup_elem(&dom_data, &dom_id); if (!domc) { scx_bpf_error("Failed to lookup dom[%u]", dom_id); @@ -854,8 +887,6 @@ void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) return; taskc->runnable = false; - taskc->runnable_for += now - taskc->runnable_at; - taskc->runnable_at = 0; ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); @@ -932,12 +963,11 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, struct scx_enable_args *args) { struct bpf_cpumask *cpumask; - struct task_ctx taskc, *map_value; + struct task_ctx taskc = { .dom_active_pids_gen = -1 }; + struct task_ctx *map_value; long ret; pid_t pid; - memset(&taskc, 0, sizeof(taskc)); - pid = p->pid; ret = bpf_map_update_elem(&task_data, &pid, &taskc, BPF_NOEXIST); if (ret) { diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index ace3fb8c31c43..d6722e8ebebbc 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -23,6 +23,7 @@ typedef unsigned long long u64; #define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ #define CACHELINE_SIZE 64 #define USAGE_HALF_LIFE 1000000000 /* 1s */ +#define MAX_DOM_ACTIVE_PIDS 1024 /* LB looks at the latest 1k active tasks per dom */ /* Statistics */ enum stat_idx { @@ -56,9 +57,8 @@ struct task_ctx { u32 dom_id; u32 weight; bool runnable; - u64 runnable_at; + u64 dom_active_pids_gen; u64 running_at; - u64 
runnable_for; /* The task is a workqueue worker thread */ bool is_kworker; diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index f2076b20674a1..87f2cc1cc4ccf 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -83,14 +83,6 @@ struct Opts { #[clap(short = 'g', long, default_value = "1")] greedy_threshold: u32, - /// The load decay factor. Every interval, the existing load is decayed - /// by this factor and new load is added. Must be in the range [0.0, - /// 0.99]. The smaller the value, the more sensitive load calculation - /// is to recent changes. When 0.0, history is ignored and the load - /// value from the latest period is used directly. - #[clap(long, default_value = "0.5")] - load_decay_factor: f64, - /// Disable load balancing. Unless disabled, periodically userspace will /// calculate the load factor of each domain and instruct BPF which /// processes to move. @@ -162,7 +154,7 @@ fn ravg_read(rd: &rusty_sys::ravg_data, now: u64, half_life: u64) -> f64 { // Fold the oldest period whicy may be partial. old += val * normalized_dur(half_life - val_at % half_life) / full_decay; - /* pre-computed decayed full-period values */ + // Pre-computed decayed full-period values. const FULL_SUMS: [f64; 20] = [ 0.5, 0.75, @@ -184,7 +176,7 @@ fn ravg_read(rd: &rusty_sys::ravg_data, now: u64, half_life: u64) -> f64 { 0.9999961853027344, 0.9999980926513672, 0.9999990463256836, - /* use the same value beyond this point */ + // Use the same value beyond this point. ]; // Fold the full periods in the middle. @@ -538,12 +530,6 @@ impl Tuner { } } -#[derive(Debug)] -struct TaskLoad { - runnable_for: u64, - load: f64, -} - #[derive(Debug)] struct TaskInfo { pid: i32, @@ -553,13 +539,11 @@ struct TaskInfo { } struct LoadBalancer<'a, 'b, 'c> { - maps: RustyMapsMut<'a>, + skel: &'a mut RustySkel<'b>, top: Arc, - task_loads: &'b mut BTreeMap, - load_decay_factor: f64, skip_kworkers: bool, - tasks_by_load: Vec, TaskInfo>>, + tasks_by_load: Vec, TaskInfo>>>, load_avg: f64, dom_loads: Vec, @@ -590,20 +574,16 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; fn new( - maps: RustyMapsMut<'a>, + skel: &'a mut RustySkel<'b>, top: Arc, - task_loads: &'b mut BTreeMap, - load_decay_factor: f64, skip_kworkers: bool, nr_lb_data_errors: &'c mut u64, ) -> Self { Self { - maps, - task_loads, - load_decay_factor, + skel, skip_kworkers, - tasks_by_load: (0..top.nr_doms).map(|_| BTreeMap::<_, _>::new()).collect(), + tasks_by_load: (0..top.nr_doms).map(|_| None).collect(), load_avg: 0f64, dom_loads: vec![0.0; top.nr_doms], @@ -617,85 +597,10 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { } } - fn read_task_loads(&mut self, period: Duration) -> Result<()> { - let now_mono = now_monotonic(); - let task_data = self.maps.task_data(); - let mut this_task_loads = BTreeMap::::new(); - - for key in task_data.keys() { - if let Some(task_ctx_vec) = task_data - .lookup(&key, libbpf_rs::MapFlags::ANY) - .context("Failed to lookup task_data")? 
- { - let task_ctx = - unsafe { &*(task_ctx_vec.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; - let pid = i32::from_ne_bytes( - key.as_slice() - .try_into() - .context("Invalid key length in task_data map")?, - ); - - let (this_at, this_for, weight) = - (task_ctx.runnable_at, task_ctx.runnable_for, task_ctx.weight); - - let (mut delta, prev_load) = match self.task_loads.get(&pid) { - Some(prev) => (this_for - prev.runnable_for, Some(prev.load)), - None => (this_for, None), - }; - - // Non-zero this_at indicates that the task is currently - // runnable. Note that we read runnable_at and runnable_for - // without any synchronization and there is a small window - // where we end up misaccounting. While this can cause - // temporary error, it's unlikely to cause any noticeable - // misbehavior especially given the load value clamping. - if this_at > 0 && this_at < now_mono { - delta += now_mono - this_at; - } - - delta = delta.min(period.as_nanos() as u64); - let this_load = (weight as f64 * delta as f64 / period.as_nanos() as f64) - .clamp(0.0, weight as f64); - - let this_load = match prev_load { - Some(prev_load) => { - prev_load * self.load_decay_factor - + this_load * (1.0 - self.load_decay_factor) - } - None => this_load, - }; - - this_task_loads.insert( - pid, - TaskLoad { - runnable_for: this_for, - load: this_load, - }, - ); - - // Only record pids that are eligible for load balancing - if task_ctx.dom_mask == (1u64 << task_ctx.dom_id) { - continue; - } - self.tasks_by_load[task_ctx.dom_id as usize].insert( - OrderedFloat(this_load), - TaskInfo { - pid, - dom_mask: task_ctx.dom_mask, - migrated: Cell::new(false), - is_kworker: task_ctx.is_kworker, - }, - ); - } - } - - *self.task_loads = this_task_loads; - Ok(()) - } - fn read_dom_loads(&mut self) -> Result<()> { let now_mono = now_monotonic(); - let dom_data = self.maps.dom_data(); + let maps = self.skel.maps(); + let dom_data = maps.dom_data(); let mut load_sum = 0.0f64; for i in 0..self.top.nr_doms { @@ -740,6 +645,68 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { Ok(()) } + fn populate_tasks_by_load(&mut self, dom: u32) -> Result<()> { + if self.tasks_by_load[dom as usize].is_some() { + return Ok(()); + } + + // Read active_pids and update write_idx and gen. + // + // XXX - We can't read task_ctx inline because self.skel.bss() + // borrows mutably and thus conflicts with self.skel.maps(). + const MAX_PIDS: u64 = rusty_sys::MAX_DOM_ACTIVE_PIDS as u64; + let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; + let mut pids = vec![]; + + let (mut ridx, widx) = (active_pids.read_idx, active_pids.write_idx); + if widx - ridx > MAX_PIDS { + ridx = widx - MAX_PIDS; + } + + for idx in ridx..widx { + let pid = active_pids.pids[(idx % MAX_PIDS) as usize]; + pids.push(pid); + } + + active_pids.read_idx = active_pids.write_idx; + active_pids.gen += 1; + + // Read task_ctx and load. + let maps = self.skel.maps(); + let task_data = maps.task_data(); + let now_mono = now_monotonic(); + let mut tasks_by_load = BTreeMap::new(); + + for pid in pids.iter() { + let key = unsafe { std::mem::transmute::(*pid) }; + + if let Some(task_data_elem) = task_data.lookup(&key, libbpf_rs::MapFlags::ANY)? 
{ + let task_ctx = + unsafe { &*(task_data_elem.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; + + let load = task_ctx.weight as f64 + * ravg_read( + &task_ctx.dcyc_rd, + now_mono, + rusty_sys::USAGE_HALF_LIFE as u64, + ); + + tasks_by_load.insert( + OrderedFloat(load), + TaskInfo { + pid: *pid, + dom_mask: task_ctx.dom_mask, + migrated: Cell::new(false), + is_kworker: task_ctx.is_kworker, + }, + ); + } + } + + self.tasks_by_load[dom as usize] = Some(tasks_by_load); + Ok(()) + } + // Find the first candidate pid which hasn't already been migrated and // can run in @pull_dom. fn find_first_candidate<'d, I>( @@ -765,10 +732,10 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { } fn pick_victim( - &self, + &mut self, (push_dom, to_push): (u32, f64), (pull_dom, to_pull): (u32, f64), - ) -> Option<(&TaskInfo, f64)> { + ) -> Result> { let to_xfer = to_pull.min(to_push) * Self::LOAD_IMBAL_XFER_TARGET_RATIO; trace!( @@ -781,6 +748,8 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { let calc_new_imbal = |xfer: f64| (to_push - xfer).abs() + (to_pull - xfer).abs(); + self.populate_tasks_by_load(push_dom)?; + trace!( "to_xfer={:.2} tasks_by_load={:?}", to_xfer, @@ -797,6 +766,8 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { let (load, task, new_imbal) = match ( Self::find_first_candidate( self.tasks_by_load[push_dom as usize] + .as_ref() + .unwrap() .range((Unbounded, Included(&OrderedFloat(to_xfer)))) .rev(), pull_dom, @@ -804,12 +775,14 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { ), Self::find_first_candidate( self.tasks_by_load[push_dom as usize] + .as_ref() + .unwrap() .range((Included(&OrderedFloat(to_xfer)), Unbounded)), pull_dom, self.skip_kworkers, ), ) { - (None, None) => return None, + (None, None) => return Ok(None), (Some((load, task)), None) | (None, Some((load, task))) => { (load, task, calc_new_imbal(load)) } @@ -835,7 +808,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { old_imbal, new_imbal ); - return None; + return Ok(None); } trace!( @@ -847,13 +820,13 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { new_imbal, ); - Some((task, load)) + Ok(Some((task, load))) } // Actually execute the load balancing. Concretely this writes pid -> dom // entries into the lb_data map for bpf side to consume. fn load_balance(&mut self) -> Result<()> { - clear_map(self.maps.lb_data()); + clear_map(self.skel.maps().lb_data()); trace!("imbal={:?}", &self.imbal); trace!("doms_to_push={:?}", &self.doms_to_push); @@ -875,7 +848,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { for (to_pull, pull_dom) in pull_doms.iter_mut() { if let Some((task, load)) = - self.pick_victim((push_dom, to_push), (*pull_dom, f64::from(*to_pull))) + self.pick_victim((push_dom, to_push), (*pull_dom, f64::from(*to_pull)))? { // Execute migration. task.migrated.set(true); @@ -886,7 +859,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // Ask BPF code to execute the migration. 
let pid = task.pid; let cpid = (pid as libc::pid_t).to_ne_bytes(); - if let Err(e) = self.maps.lb_data().update( + if let Err(e) = self.skel.maps_mut().lb_data().update( &cpid, &pull_dom.to_ne_bytes(), libbpf_rs::MapFlags::NO_EXIST, @@ -926,7 +899,6 @@ struct Scheduler<'a> { sched_interval: Duration, tune_interval: Duration, - load_decay_factor: f64, balance_load: bool, balanced_kworkers: bool, @@ -935,7 +907,6 @@ struct Scheduler<'a> { prev_at: Instant, prev_total_cpu: procfs::CpuStat, - task_loads: BTreeMap, nr_lb_data_errors: u64, @@ -1013,7 +984,6 @@ impl<'a> Scheduler<'a> { sched_interval: Duration::from_secs_f64(opts.interval), tune_interval: Duration::from_secs_f64(opts.tune_interval), - load_decay_factor: opts.load_decay_factor.clamp(0.0, 0.99), balance_load: !opts.no_load_balance, balanced_kworkers: opts.balanced_kworkers, @@ -1022,7 +992,6 @@ impl<'a> Scheduler<'a> { prev_at: Instant::now(), prev_total_cpu, - task_loads: BTreeMap::new(), nr_lb_data_errors: 0, @@ -1198,10 +1167,8 @@ impl<'a> Scheduler<'a> { let cpu_busy = self.get_cpu_busy()?; let mut lb = LoadBalancer::new( - self.skel.maps_mut(), + &mut self.skel, self.top.clone(), - &mut self.task_loads, - self.load_decay_factor, self.balanced_kworkers, &mut self.nr_lb_data_errors, ); @@ -1209,8 +1176,7 @@ impl<'a> Scheduler<'a> { lb.read_dom_loads()?; lb.calculate_dom_load_balance()?; - if self.balance_load && lb.doms_to_push.len() > 0 { - lb.read_task_loads(started_at.duration_since(self.prev_at))?; + if self.balance_load { lb.load_balance()?; } From ca211c601f4a3f10c20311f1203b79704bb388ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2023 23:50:26 -1000 Subject: [PATCH 163/304] ravg: Fix ravg_transfer() --- tools/sched_ext/ravg_impl.bpf.h | 63 ++++++++++--------- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 13 ++-- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/tools/sched_ext/ravg_impl.bpf.h b/tools/sched_ext/ravg_impl.bpf.h index 3b254855220be..cef5efdfe3966 100644 --- a/tools/sched_ext/ravg_impl.bpf.h +++ b/tools/sched_ext/ravg_impl.bpf.h @@ -4,9 +4,9 @@ #define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) //#define RAVG_FN_ATTRS __attribute__((unused)) -static RAVG_FN_ATTRS void ravg_add(__u64 *sum, __u64 addend) +static RAVG_FN_ATTRS void ravg_add(u64 *sum, u64 addend) { - __u64 new = *sum + addend; + u64 new = *sum + addend; if (new >= *sum) *sum = new; @@ -14,7 +14,7 @@ static RAVG_FN_ATTRS void ravg_add(__u64 *sum, __u64 addend) *sum = -1; } -static RAVG_FN_ATTRS __u64 ravg_decay(__u64 v, __u32 shift) +static RAVG_FN_ATTRS u64 ravg_decay(u64 v, u32 shift) { if (shift >= 64) return 0; @@ -22,10 +22,10 @@ static RAVG_FN_ATTRS __u64 ravg_decay(__u64 v, __u32 shift) return v >> shift; } -static RAVG_FN_ATTRS __u32 ravg_normalize_dur(__u32 dur, __u32 half_life) +static RAVG_FN_ATTRS u32 ravg_normalize_dur(u32 dur, u32 half_life) { if (dur < half_life) - return (((__u64)dur << RAVG_FRAC_BITS) + half_life - 1) / + return (((u64)dur << RAVG_FRAC_BITS) + half_life - 1) / half_life; else return 1 << RAVG_FRAC_BITS; @@ -40,7 +40,7 @@ static RAVG_FN_ATTRS __u32 ravg_normalize_dur(__u32 dur, __u32 half_life) * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) * ... */ -static __u64 ravg_full_sum[] = { +static u64 ravg_full_sum[] = { 524288, 786432, 917504, 983040, 1015808, 1032192, 1040384, 1044480, 1046528, 1047552, 1048064, 1048320, @@ -60,11 +60,10 @@ static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_su * * The current value is changing to @val at @now. 
Accumulate accordingly. */ -static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, - __u64 new_val, __u64 now, - __u32 half_life) +static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, u64 new_val, u64 now, + u32 half_life) { - __u32 cur_seq, val_seq, seq_delta; + u32 cur_seq, val_seq, seq_delta; /* * It may be difficult for the caller to guarantee monotonic progress if @@ -111,7 +110,7 @@ static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, * seq delta [ 3 | 2 | 1 | 0 ] */ if (seq_delta > 0) { - __u32 dur; + u32 dur; /* fold the oldest period which may be partial */ dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); @@ -119,7 +118,7 @@ static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, /* fold the full periods in the middle with precomputed vals */ if (seq_delta > 1) { - __u32 idx = seq_delta - 2; + u32 idx = seq_delta - 2; if (idx >= ravg_full_sum_len) idx = ravg_full_sum_len - 1; @@ -145,7 +144,9 @@ static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, /** * ravg_transfer - Transfer in or out a component running avg * @base: ravg_data to transfer @xfer into or out of + * @base_new_val: new value for @base * @xfer: ravg_data to transfer + * @xfer_new_val: new value for @xfer * @is_xfer_in: transfer direction * * An ravg may be a sum of component ravgs. For example, a scheduling domain's @@ -157,12 +158,17 @@ static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, * must have been accumulated at the same timestamp. @xfer's contribution is * subtracted if @is_fer_in is %false and added if %true. */ -static RAVG_FN_ATTRS int ravg_transfer(struct ravg_data *base, struct ravg_data *xfer, - bool is_xfer_in) +static RAVG_FN_ATTRS void ravg_transfer(struct ravg_data *base, u64 base_new_val, + struct ravg_data *xfer, u64 xfer_new_val, + u32 half_life, bool is_xfer_in) { - if (base->val_at != xfer->val_at) - return -EINVAL; + /* synchronize @base and @xfer */ + if ((s64)(base->val_at - xfer->val_at) < 0) + ravg_accumulate(base, base_new_val, xfer->val_at, half_life); + else if ((s64)(base->val_at - xfer->val_at) > 0) + ravg_accumulate(xfer, xfer_new_val, base->val_at, half_life); + /* transfer */ if (is_xfer_in) { base->old += xfer->old; base->cur += xfer->cur; @@ -177,8 +183,6 @@ static RAVG_FN_ATTRS int ravg_transfer(struct ravg_data *base, struct ravg_data else base->cur = 0; } - - return 0; } /** @@ -192,11 +196,11 @@ static RAVG_FN_ATTRS int ravg_transfer(struct ravg_data *base, struct ravg_data * ensure that the final shifted result fits in u64. */ static inline __attribute__((always_inline)) -__u64 u64_x_u32_rshift(__u64 a, __u32 b, __u32 rshift) +u64 u64_x_u32_rshift(u64 a, u32 b, u32 rshift) { - const __u64 mask32 = (__u32)-1; - __u64 al = a & mask32; - __u64 ah = (a & (mask32 << 32)) >> 32; + const u64 mask32 = (u32)-1; + u64 al = a & mask32; + u64 ah = (a & (mask32 << 32)) >> 32; /* * ah: high 32 al: low 32 @@ -247,11 +251,10 @@ static RAVG_FN_ATTRS void ravg_scale(struct ravg_data *rd, u32 mult, u32 rshift) * * Read running avg from @rd as of @now. */ -static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, - __u64 half_life) +static RAVG_FN_ATTRS u64 ravg_read(struct ravg_data *rd, u64 now, u64 half_life) { struct ravg_data trd; - __u32 elapsed; + u32 elapsed; /* * It may be difficult for the caller to guarantee monotonic progress if @@ -278,7 +281,7 @@ static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, * + current load / 2. Inbetween, we blend the two linearly. 
*/ if (elapsed) { - __u32 progress = ravg_normalize_dur(elapsed, half_life); + u32 progress = ravg_normalize_dur(elapsed, half_life); /* * `H` is the duration of the half-life window, and `E` is how * much time has elapsed in this window. `P` is [0.0, 1.0] @@ -309,9 +312,9 @@ static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle * the interim multiplication correctly. */ - __u64 old = u64_x_u32_rshift(rd->old, - (1 << RAVG_FRAC_BITS) - progress / 2, - RAVG_FRAC_BITS); + u64 old = u64_x_u32_rshift(rd->old, + (1 << RAVG_FRAC_BITS) - progress / 2, + RAVG_FRAC_BITS); /* * If `S` is the Sum(val * duration) for this half-life window, * the avg for this window is: @@ -347,7 +350,7 @@ static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, * * rd->cur / 2 */ - __u64 cur = rd->cur / 2; + u64 cur = rd->cur / 2; return old + cur; } else { diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 9e28d9d9bf13d..538a3d5b5c71d 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -161,7 +161,6 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, struct lock_wrapper *from_lockw, *to_lockw; struct ravg_data task_load_rd; u64 from_load[2], to_load[2], task_load; - int ret; from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id); from_lockw = bpf_map_lookup_elem(&dom_load_locks, &from_dom_id); @@ -188,37 +187,33 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, bpf_spin_lock(&from_lockw->lock); if (taskc->runnable) from_domc->load -= p->scx.weight; - ravg_accumulate(&from_domc->load_rd, from_domc->load, now, USAGE_HALF_LIFE); if (debug >= 2) from_load[0] = ravg_read(&from_domc->load_rd, now, USAGE_HALF_LIFE); - ret = ravg_transfer(&from_domc->load_rd, &task_load_rd, false); + ravg_transfer(&from_domc->load_rd, from_domc->load, + &task_load_rd, taskc->runnable, USAGE_HALF_LIFE, false); if (debug >= 2) from_load[1] = ravg_read(&from_domc->load_rd, now, USAGE_HALF_LIFE); bpf_spin_unlock(&from_lockw->lock); - if (ret < 0) - scx_bpf_error("Failed to transfer out load"); /* transfer into @to_dom_id */ bpf_spin_lock(&to_lockw->lock); if (taskc->runnable) to_domc->load += p->scx.weight; - ravg_accumulate(&to_domc->load_rd, to_domc->load, now, USAGE_HALF_LIFE); if (debug >= 2) to_load[0] = ravg_read(&to_domc->load_rd, now, USAGE_HALF_LIFE); - ret = ravg_transfer(&to_domc->load_rd, &task_load_rd, true); + ravg_transfer(&to_domc->load_rd, to_domc->load, + &task_load_rd, taskc->runnable, USAGE_HALF_LIFE, true); if (debug >= 2) to_load[1] = ravg_read(&to_domc->load_rd, now, USAGE_HALF_LIFE); bpf_spin_unlock(&to_lockw->lock); - if (ret < 0) - scx_bpf_error("Failed to transfer in load"); if (debug >= 2) bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu", From 8111b6e79596fc197a78990d5a60e60c02224e08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Nov 2023 00:14:01 -1000 Subject: [PATCH 164/304] scx_rusty: Improve debug messages --- tools/sched_ext/scx_rusty/src/main.rs | 43 +++++++++++---------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 87f2cc1cc4ccf..dcd44b4b8471a 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -29,6 +29,7 @@ use clap::Parser; use libbpf_rs::skel::OpenSkel as _; use 
libbpf_rs::skel::Skel as _; use libbpf_rs::skel::SkelBuilder as _; +use log::debug; use log::info; use log::trace; use log::warn; @@ -703,6 +704,13 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { } } + debug!( + "DOM[{:02}] read load for {} tasks", + dom, + &tasks_by_load.len(), + ); + trace!("DOM[{:02}] tasks_by_load={:?}", dom, &tasks_by_load); + self.tasks_by_load[dom as usize] = Some(tasks_by_load); Ok(()) } @@ -738,24 +746,15 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { ) -> Result> { let to_xfer = to_pull.min(to_push) * Self::LOAD_IMBAL_XFER_TARGET_RATIO; - trace!( + debug!( "considering dom {}@{:.2} -> {}@{:.2}", - push_dom, - to_push, - pull_dom, - to_pull + push_dom, to_push, pull_dom, to_pull ); let calc_new_imbal = |xfer: f64| (to_push - xfer).abs() + (to_pull - xfer).abs(); self.populate_tasks_by_load(push_dom)?; - trace!( - "to_xfer={:.2} tasks_by_load={:?}", - to_xfer, - &self.tasks_by_load[push_dom as usize] - ); - // We want to pick a task to transfer from push_dom to pull_dom to // reduce the load imbalance between the two closest to $to_xfer. // IOW, pick a task which has the closest load value to $to_xfer @@ -800,24 +799,16 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // to do for this pair. let old_imbal = to_push + to_pull; if old_imbal < new_imbal { - trace!( + debug!( "skipping pid {}, dom {} -> {} won't improve imbal {:.2} -> {:.2}", - task.pid, - push_dom, - pull_dom, - old_imbal, - new_imbal + task.pid, push_dom, pull_dom, old_imbal, new_imbal ); return Ok(None); } - trace!( + debug!( "migrating pid {}, dom {} -> {}, imbal={:.2} -> {:.2}", - task.pid, - push_dom, - pull_dom, - old_imbal, - new_imbal, + task.pid, push_dom, pull_dom, old_imbal, new_imbal, ); Ok(Some((task, load))) @@ -828,9 +819,9 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { fn load_balance(&mut self) -> Result<()> { clear_map(self.skel.maps().lb_data()); - trace!("imbal={:?}", &self.imbal); - trace!("doms_to_push={:?}", &self.doms_to_push); - trace!("doms_to_pull={:?}", &self.doms_to_pull); + debug!("imbal={:?}", &self.imbal); + debug!("doms_to_push={:?}", &self.doms_to_push); + debug!("doms_to_pull={:?}", &self.doms_to_pull); // Push from the most imbalanced to least. 
while let Some((OrderedFloat(mut to_push), push_dom)) = self.doms_to_push.pop_last() { From 46f07fa0fa558fd791ff893652e5e9a1c7d7daf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Nov 2023 00:22:57 -1000 Subject: [PATCH 165/304] scx_rusty: Minor cleanup --- tools/sched_ext/scx_rusty/src/main.rs | 33 ++++++++++++--------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index dcd44b4b8471a..649f9c29e4507 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -35,6 +35,10 @@ use log::trace; use log::warn; use ordered_float::OrderedFloat; +const USAGE_HALF_LIFE: u64 = rusty_sys::USAGE_HALF_LIFE as u64; +const MAX_DOMS: usize = rusty_sys::MAX_DOMS as usize; +const MAX_CPUS: usize = rusty_sys::MAX_CPUS as usize; + /// scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF /// part does simple round robin in each domain and the userspace part /// calculates the load factor of each domain and tells the BPF part how to load @@ -290,16 +294,15 @@ struct Topology { impl Topology { fn from_cpumasks(cpumasks: &[String], nr_cpus: usize) -> Result { - if cpumasks.len() > rusty_sys::MAX_DOMS as usize { + if cpumasks.len() > MAX_DOMS { bail!( "Number of requested domains ({}) is greater than MAX_DOMS ({})", cpumasks.len(), - rusty_sys::MAX_DOMS + MAX_DOMS ); } let mut cpu_dom = vec![None; nr_cpus]; - let mut dom_cpus = - vec![bitvec![u64, Lsb0; 0; rusty_sys::MAX_CPUS as usize]; cpumasks.len()]; + let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; cpumasks.len()]; for (dom, cpumask) in cpumasks.iter().enumerate() { let hex_str = { let mut tmp_str = cpumask @@ -403,16 +406,16 @@ impl Topology { nr_doms += 1; } - if nr_doms > rusty_sys::MAX_DOMS as usize { + if nr_doms > MAX_DOMS { bail!( "Total number of doms {} is greater than MAX_DOMS ({})", nr_doms, - rusty_sys::MAX_DOMS + MAX_DOMS ); } // Build and return dom -> cpumask and cpu -> dom mappings. 
- let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; rusty_sys::MAX_CPUS as usize]; nr_doms]; + let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; nr_doms]; let mut cpu_dom = vec![]; for (cpu, cache) in cpu_to_cache.iter().enumerate().take(nr_cpus) { @@ -615,11 +618,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { &*(dom_ctx_map_elem.as_slice().as_ptr() as *const rusty_sys::dom_ctx) }; - self.dom_loads[i] = ravg_read( - &dom_ctx.load_rd, - now_mono, - rusty_sys::USAGE_HALF_LIFE as u64, - ); + self.dom_loads[i] = ravg_read(&dom_ctx.load_rd, now_mono, USAGE_HALF_LIFE); load_sum += self.dom_loads[i]; } @@ -686,11 +685,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { unsafe { &*(task_data_elem.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; let load = task_ctx.weight as f64 - * ravg_read( - &task_ctx.dcyc_rd, - now_mono, - rusty_sys::USAGE_HALF_LIFE as u64, - ); + * ravg_read(&task_ctx.dcyc_rd, now_mono, USAGE_HALF_LIFE); tasks_by_load.insert( OrderedFloat(load), @@ -912,11 +907,11 @@ impl<'a> Scheduler<'a> { let mut skel = skel_builder.open().context("Failed to open BPF program")?; let nr_cpus = libbpf_rs::num_possible_cpus().unwrap(); - if nr_cpus > rusty_sys::MAX_CPUS as usize { + if nr_cpus > MAX_CPUS { bail!( "nr_cpus ({}) is greater than MAX_CPUS ({})", nr_cpus, - rusty_sys::MAX_CPUS + MAX_CPUS ); } From f244d5eca1d2b7af184c8cfd8c81dcb1397f6003 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Nov 2023 06:13:14 -1000 Subject: [PATCH 166/304] scx_rusty: Comments, parameterize ravg half life --- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 25 ++++++++------- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 20 +++++++++--- tools/sched_ext/scx_rusty/src/main.rs | 32 +++++++++++++------ 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 538a3d5b5c71d..9286833da571d 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -60,6 +60,7 @@ const volatile u32 nr_doms = 32; /* !0 for veristat, set during init */ const volatile u32 nr_cpus = 64; /* !0 for veristat, set during init */ const volatile u32 cpu_dom_id_map[MAX_CPUS]; const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; +const volatile u32 load_half_life = 1000000000 /* 1s */; const volatile bool kthreads_local; const volatile bool fifo_sched; @@ -137,7 +138,7 @@ static void dom_load_adj(u32 dom_id, s64 adj, u64 now) bpf_spin_lock(&lockw->lock); domc->load += adj; - ravg_accumulate(&domc->load_rd, domc->load, now, USAGE_HALF_LIFE); + ravg_accumulate(&domc->load_rd, domc->load, now, load_half_life); bpf_spin_unlock(&lockw->lock); if (adj < 0 && (s64)domc->load < 0) @@ -149,7 +150,7 @@ static void dom_load_adj(u32 dom_id, s64 adj, u64 now) bpf_printk("LOAD ADJ dom=%u adj=%lld load=%llu", dom_id, adj, - ravg_read(&domc->load_rd, now, USAGE_HALF_LIFE) >> RAVG_FRAC_BITS); + ravg_read(&domc->load_rd, now, load_half_life) >> RAVG_FRAC_BITS); domc->dbg_load_printed_at = now; } } @@ -176,12 +177,12 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, * should be moved together. We only track duty cycle for tasks. Scale * it by weight to get load_rd. 
*/ - ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); task_load_rd = taskc->dcyc_rd; ravg_scale(&task_load_rd, p->scx.weight, 0); if (debug >= 2) - task_load = ravg_read(&task_load_rd, now, USAGE_HALF_LIFE); + task_load = ravg_read(&task_load_rd, now, load_half_life); /* transfer out of @from_dom_id */ bpf_spin_lock(&from_lockw->lock); @@ -189,13 +190,13 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, from_domc->load -= p->scx.weight; if (debug >= 2) - from_load[0] = ravg_read(&from_domc->load_rd, now, USAGE_HALF_LIFE); + from_load[0] = ravg_read(&from_domc->load_rd, now, load_half_life); ravg_transfer(&from_domc->load_rd, from_domc->load, - &task_load_rd, taskc->runnable, USAGE_HALF_LIFE, false); + &task_load_rd, taskc->runnable, load_half_life, false); if (debug >= 2) - from_load[1] = ravg_read(&from_domc->load_rd, now, USAGE_HALF_LIFE); + from_load[1] = ravg_read(&from_domc->load_rd, now, load_half_life); bpf_spin_unlock(&from_lockw->lock); @@ -205,13 +206,13 @@ static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, to_domc->load += p->scx.weight; if (debug >= 2) - to_load[0] = ravg_read(&to_domc->load_rd, now, USAGE_HALF_LIFE); + to_load[0] = ravg_read(&to_domc->load_rd, now, load_half_life); ravg_transfer(&to_domc->load_rd, to_domc->load, - &task_load_rd, taskc->runnable, USAGE_HALF_LIFE, true); + &task_load_rd, taskc->runnable, load_half_life, true); if (debug >= 2) - to_load[1] = ravg_read(&to_domc->load_rd, now, USAGE_HALF_LIFE); + to_load[1] = ravg_read(&to_domc->load_rd, now, load_half_life); bpf_spin_unlock(&to_lockw->lock); @@ -798,7 +799,7 @@ void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) taskc->runnable = true; taskc->is_kworker = p->flags & PF_WQ_WORKER; - ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); dom_load_adj(taskc->dom_id, p->scx.weight, now); } @@ -883,7 +884,7 @@ void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) taskc->runnable = false; - ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, USAGE_HALF_LIFE); + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); } diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index d6722e8ebebbc..bb40c34f43a07 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -19,11 +19,21 @@ typedef unsigned long long u64; #include "../../../ravg.bpf.h" -#define MAX_CPUS 512 -#define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ -#define CACHELINE_SIZE 64 -#define USAGE_HALF_LIFE 1000000000 /* 1s */ -#define MAX_DOM_ACTIVE_PIDS 1024 /* LB looks at the latest 1k active tasks per dom */ +enum consts { + MAX_CPUS = 512, + MAX_DOMS = 64, /* limited to avoid complex bitmask ops */ + CACHELINE_SIZE = 64, + + /* + * When userspace load balancer is trying to determine the tasks to push + * out from an overloaded domain, it looks at the the following number + * of recently active tasks of the domain. While this may lead to + * spurious migration victim selection failures in pathological cases, + * this isn't a practical problem as the LB rounds are best-effort + * anyway and will be retried until loads are balanced. 
+ */ + MAX_DOM_ACTIVE_PIDS = 1024, +}; /* Statistics */ enum stat_idx { diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 649f9c29e4507..e7d509bdf34ca 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -35,9 +35,8 @@ use log::trace; use log::warn; use ordered_float::OrderedFloat; -const USAGE_HALF_LIFE: u64 = rusty_sys::USAGE_HALF_LIFE as u64; -const MAX_DOMS: usize = rusty_sys::MAX_DOMS as usize; -const MAX_CPUS: usize = rusty_sys::MAX_CPUS as usize; +const MAX_DOMS: usize = rusty_sys::consts_MAX_DOMS as usize; +const MAX_CPUS: usize = rusty_sys::consts_MAX_CPUS as usize; /// scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF /// part does simple round robin in each domain and the userspace part @@ -68,6 +67,10 @@ struct Opts { #[clap(short = 'I', long, default_value = "0.1")] tune_interval: f64, + /// The half-life of task and domain load running averages in seconds. + #[clap(short = 'l', long, default_value = "1.0")] + load_half_life: f64, + /// Build domains according to how CPUs are grouped at this cache level /// as determined by /sys/devices/system/cpu/cpuX/cache/indexI/id. #[clap(short = 'c', long, default_value = "3")] @@ -131,8 +134,9 @@ struct Opts { verbose: u8, } -fn ravg_read(rd: &rusty_sys::ravg_data, now: u64, half_life: u64) -> f64 { +fn ravg_read(rd: &rusty_sys::ravg_data, now: u64, half_life: u32) -> f64 { const RAVG_1: f64 = (1 << rusty_sys::ravg_consts_RAVG_FRAC_BITS) as f64; + let half_life = half_life as u64; let val = rd.val as f64; let val_at = rd.val_at; let mut old = rd.old as f64 / RAVG_1; @@ -603,6 +607,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { fn read_dom_loads(&mut self) -> Result<()> { let now_mono = now_monotonic(); + let load_half_life = self.skel.rodata().load_half_life; let maps = self.skel.maps(); let dom_data = maps.dom_data(); let mut load_sum = 0.0f64; @@ -618,7 +623,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { &*(dom_ctx_map_elem.as_slice().as_ptr() as *const rusty_sys::dom_ctx) }; - self.dom_loads[i] = ravg_read(&dom_ctx.load_rd, now_mono, USAGE_HALF_LIFE); + self.dom_loads[i] = ravg_read(&dom_ctx.load_rd, now_mono, load_half_life); load_sum += self.dom_loads[i]; } @@ -629,7 +634,8 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { Ok(()) } - // To balance dom loads we identify doms with lower and higher load than average + /// To balance dom loads, identify doms with lower and higher load than + /// average. fn calculate_dom_load_balance(&mut self) -> Result<()> { for (dom, dom_load) in self.dom_loads.iter().enumerate() { let imbal = dom_load - self.load_avg; @@ -645,6 +651,8 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { Ok(()) } + /// @dom needs to push out tasks to balance loads. Make sure its + /// tasks_by_load is populated so that the victim tasks can be picked. fn populate_tasks_by_load(&mut self, dom: u32) -> Result<()> { if self.tasks_by_load[dom as usize].is_some() { return Ok(()); @@ -654,7 +662,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // // XXX - We can't read task_ctx inline because self.skel.bss() // borrows mutably and thus conflicts with self.skel.maps(). - const MAX_PIDS: u64 = rusty_sys::MAX_DOM_ACTIVE_PIDS as u64; + const MAX_PIDS: u64 = rusty_sys::consts_MAX_DOM_ACTIVE_PIDS as u64; let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; let mut pids = vec![]; @@ -672,6 +680,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { active_pids.gen += 1; // Read task_ctx and load. 
+ let load_half_life = self.skel.rodata().load_half_life; let maps = self.skel.maps(); let task_data = maps.task_data(); let now_mono = now_monotonic(); @@ -684,8 +693,12 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { let task_ctx = unsafe { &*(task_data_elem.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; - let load = task_ctx.weight as f64 - * ravg_read(&task_ctx.dcyc_rd, now_mono, USAGE_HALF_LIFE); + if task_ctx.dom_id != dom { + continue; + } + + let load = + task_ctx.weight as f64 * ravg_read(&task_ctx.dcyc_rd, now_mono, load_half_life); tasks_by_load.insert( OrderedFloat(load), @@ -943,6 +956,7 @@ impl<'a> Scheduler<'a> { } skel.rodata().slice_ns = opts.slice_us * 1000; + skel.rodata().load_half_life = (opts.load_half_life * 1000000000.0) as u32; skel.rodata().kthreads_local = opts.kthreads_local; skel.rodata().fifo_sched = opts.fifo_sched; skel.rodata().switch_partial = opts.partial; From a60668f46f5940b87f3f705ac45e827187230711 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 5 Nov 2023 11:40:37 -1000 Subject: [PATCH 167/304] sched_ext: Test sched_class directly in scx_task_iter_next_filtered() scx_task_iter_next_filtered() is used to iterate all non-idle tasks in the init and exit paths. Idle tasks are determined using is_idle_task(). Unfortunately, cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup") changed idle task initialization so that %PF_IDLE is set during CPU startup. So, CPUs that are not brought up during boot (such as CPUs which can never be online in some AMD processors) don't have the flag set and thus fails is_idle_task() test. This makes sched_ext incorrectly try to operate on idle tasks in init/exit paths leading to oopses. Fix it by directly testing p->sched_class against idle_sched_class. --- kernel/sched/ext.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d59c3130cc135..5f96790ce9782 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -432,7 +432,11 @@ scx_task_iter_next_filtered(struct scx_task_iter *iter) struct task_struct *p; while ((p = scx_task_iter_next(iter))) { - if (!is_idle_task(p)) + /* + * is_idle_task() tests %PF_IDLE which may not be set for CPUs + * which haven't yet been onlined. Test sched_class directly. + */ + if (p->sched_class != &idle_sched_class) return p; } return NULL; From dfee93e25773a35bcdc25fdbe0b74bc9a895f765 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 3 Nov 2023 23:09:12 +0100 Subject: [PATCH 168/304] selftests/bpf: Disable CONFIG_DEBUG_INFO_REDUCED in config.aarch64 Building an arm64 kernel and seftests/bpf with defconfig + selftests/bpf/config and selftests/bpf/config.aarch64 the fragment CONFIG_DEBUG_INFO_REDUCED is enabled in arm64's defconfig, it should be disabled in file sefltests/bpf/config.aarch64 since if its not disabled CONFIG_DEBUG_INFO_BTF wont be enabled. 
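Going back to the scx_task_iter_next_filtered() fix above, the check it adds can be read as a small predicate. The following sketch is for illustration only and is not part of either patch; the helper name is made up, and it assumes the kernel-internal idle_sched_class declaration visible inside kernel/sched/:

	/* Sketch: filter out idle tasks without relying on PF_IDLE. */
	static bool scx_task_is_idle(const struct task_struct *p)
	{
		/*
		 * PF_IDLE is only set once a CPU is actually brought up, so it
		 * can be missing on the idle tasks of never-onlined CPUs. The
		 * sched_class pointer is set at initialization and is reliable.
		 */
		return p->sched_class == &idle_sched_class;
	}

Tasks for which such a predicate returns false are the ones the init/exit iteration should operate on.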
Signed-off-by: Anders Roxell Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231103220912.333930-1-anders.roxell@linaro.org --- tools/testing/selftests/bpf/config.aarch64 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64 index fa8ecf626c73e..29c8635c57220 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -36,6 +36,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=y CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_INFO_REDUCED=n CONFIG_DEBUG_LIST=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_NOTIFIERS=y From 856624f12b04a3f51094fa277a31a333ee81cb3f Mon Sep 17 00:00:00 2001 From: Florian Lehner Date: Sun, 5 Nov 2023 09:58:01 +0100 Subject: [PATCH 169/304] bpf, lpm: Fix check prefixlen before walking trie When looking up an element in LPM trie, the condition 'matchlen == trie->max_prefixlen' will never return true, if key->prefixlen is larger than trie->max_prefixlen. Consequently all elements in the LPM trie will be visited and no element is returned in the end. To resolve this, check key->prefixlen first before walking the LPM trie. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Signed-off-by: Florian Lehner Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231105085801.3742-1-dev@der-flo.net --- kernel/bpf/lpm_trie.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 17c7e7782a1f7..b32be680da6cd 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -231,6 +231,9 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) struct lpm_trie_node *node, *found = NULL; struct bpf_lpm_trie_key *key = _key; + if (key->prefixlen > trie->max_prefixlen) + return NULL; + /* Start walking the trie from the root node ... */ for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); From 53cb301b01b80e26fb7f7e5f72e70cdebb0a9362 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 10:19:50 -1000 Subject: [PATCH 170/304] sched_ext/ravg: Drop unnecessary comment line --- tools/sched_ext/ravg_impl.bpf.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/sched_ext/ravg_impl.bpf.h b/tools/sched_ext/ravg_impl.bpf.h index cef5efdfe3966..4922a3e689bc8 100644 --- a/tools/sched_ext/ravg_impl.bpf.h +++ b/tools/sched_ext/ravg_impl.bpf.h @@ -2,7 +2,6 @@ #include "ravg.bpf.h" #define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) -//#define RAVG_FN_ATTRS __attribute__((unused)) static RAVG_FN_ATTRS void ravg_add(u64 *sum, u64 addend) { From f5994832887452b1466c94ba65104b6b2b50a748 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 12:34:54 -1000 Subject: [PATCH 171/304] tools/sched_ext/Makefile: Don't hard code scx_rusty in rust-sched _deps target --- tools/sched_ext/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 3865ae042f8bc..91a2fc8914fa3 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -209,7 +209,8 @@ rust-sched-targets := scx_rusty # CARGO_OFFLINE=1 (e.g. if building locally), then cargo build will download # all of the necessary dependencies, and the deps target can be skipped. 
$(addsuffix _deps,$(rust-sched-targets)): - $(Q)cargo fetch --manifest-path=scx_rusty/Cargo.toml + $(eval sched=$(@:_deps=)) + $(Q)cargo fetch --manifest-path=$(sched)/Cargo.toml $(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) $(eval export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR)) From 1dff6eac6ec7a857937590c9e2a176fcad3a3df8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 13:13:12 -1000 Subject: [PATCH 172/304] scx_common: Improve MEMBER_VPTR() So that it can be used on deref'd pointers to structs. --- tools/sched_ext/scx_common.bpf.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 38168981fd0b7..948f89d1bd91b 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -104,7 +104,7 @@ BPF_PROG(name, ##args) /** * MEMBER_VPTR - Obtain the verified pointer to a struct or array member * @base: struct or array to index - * @member: dereferenced member (e.g. ->field, [idx0][idx1], ...) + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) * * The verifier often gets confused by the instruction sequence the compiler * generates for indexing struct fields or arrays. This macro forces the @@ -113,19 +113,24 @@ BPF_PROG(name, ##args) * generate the pointer to the member to help the verifier. * * Ideally, we want to abort if the calculated offset is out-of-bounds. However, - * BPF currently doesn't support abort, so evaluate to NULL instead. The caller - * must check for NULL and take appropriate action to appease the verifier. To - * avoid confusing the verifier, it's best to check for NULL and dereference + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference * immediately. * * vptr = MEMBER_VPTR(my_array, [i][j]); * if (!vptr) * return error; * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`. */ -#define MEMBER_VPTR(base, member) (typeof(base member) *)({ \ - u64 __base = (u64)base; \ - u64 __addr = (u64)&(base member) - __base; \ +#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ + _Static_assert(sizeof(base) >= sizeof((base) member)); \ asm volatile ( \ "if %0 <= %[max] goto +2\n" \ "%0 = 0\n" \ @@ -133,7 +138,7 @@ BPF_PROG(name, ##args) "%0 += %1\n" \ : "+r"(__addr) \ : "r"(__base), \ - [max]"i"(sizeof(base) - sizeof(base member))); \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ __addr; \ }) From 1d773bdc346e9c0796087047181c36ca34e15f4c Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 6 Nov 2023 20:22:25 -0600 Subject: [PATCH 173/304] scx: Fix !CONFIG_SCHED_CLASS_EXT builds cpu_local_stat_show() expects CONFIG_SCHED_CLASS_EXT or CONFIG_RT_GROUP_SCHED. 
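To make the MEMBER_VPTR() rework a few patches up concrete, here is a usage sketch. It is illustrative only: struct pid_ring and record_pid() are made-up names, and it assumes the usual scx BPF includes (vmlinux.h and scx_common.bpf.h) so the macro and its NULL convention are available:

	struct pid_ring {
		u32	pids[1024];
	};

	static void record_pid(struct pid_ring *ring, u64 idx, u32 pid)
	{
		u32 *pidp;

		/*
		 * With the updated macro, a dereferenced pointer works because
		 * sizeof() now sees the whole struct rather than the pointer:
		 * use MEMBER_VPTR(*ring, .pids[idx]), not
		 * MEMBER_VPTR(ring, ->pids[idx]).
		 */
		pidp = MEMBER_VPTR(*ring, .pids[idx]);
		if (!pidp)
			return;	/* out-of-bounds index, macro evaluated to NULL */

		*pidp = pid;
	}

The NULL check is what keeps the verifier happy when @idx cannot be proven in range at compile time.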
Signed-off-by: David Vernet --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c01006fba569d..57ba9b090455a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11421,6 +11421,7 @@ static int cpu_extra_stat_show(struct seq_file *sf, return 0; } +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static int cpu_local_stat_show(struct seq_file *sf, struct cgroup_subsys_state *css) { @@ -11438,6 +11439,7 @@ static int cpu_local_stat_show(struct seq_file *sf, #endif return 0; } +#endif #if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) From 54d303dfe617a1678de15fdb544b7e41ff838542 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 31 Oct 2023 21:40:47 -0500 Subject: [PATCH 174/304] scx: Print scx info when dumping stack It would be useful to see what the sched_ext scheduler state is, and what scheduler is running, when we're dumping a task's stack. This patch therefore adds a new print_scx_info() function that's called in the same context as print_worker_info() and print_stop_info(). Signed-off-by: David Vernet --- include/linux/sched/ext.h | 2 ++ kernel/sched/core.c | 1 + kernel/sched/ext.c | 62 +++++++++++++++++++++++++++++++++++++++ lib/dump_stack.c | 1 + 4 files changed, 66 insertions(+) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 24f74ebeb7af1..62b6e1c34d0f1 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -705,10 +705,12 @@ struct sched_ext_entity { }; void sched_ext_free(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* _LINUX_SCHED_EXT_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c01006fba569d..00c9166b4af88 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9267,6 +9267,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); + print_scx_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5f96790ce9782..5cd65c18b1675 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3688,6 +3688,68 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) cpumask_clear(this_rq->scx.cpus_to_wait); } +/** + * print_scx_info - print out sched_ext scheduler state + * @log_lvl: the log level to use when printing + * @p: target task + * + * If @p is running on a sched_ext scheduler, print out the name of the + * sched_ext scheduler, and various other scheduler-related debugging + * information about the task. + * + * This function can be safely called on any task as long as the + * task_struct itself is accessible. While safe, this function isn't + * synchronized and may print out mixed-up or garbage values of limited length. + */ +void print_scx_info(const char *log_lvl, struct task_struct *p) +{ + struct sched_class *class = NULL; + enum scx_ops_enable_state state = scx_ops_enable_state(); + s64 delta = 0; + long ops_state = 0; + int task_cpu; + struct thread_info *thread_info; + const char *all = READ_ONCE(scx_switching_all) ?
"+all" : ""; + + if (!scx_enabled() || state == SCX_OPS_DISABLED) + return; + + /* + * Carefully check if the task was running on sched_ext, and then + * carefully copy the time it's been runnable, and its state. + */ + copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)); + if (!class || class != &ext_sched_class) { + printk("%ssched_ext: %s (%s%s)", log_lvl, scx_ops.name, + scx_ops_enable_state_str[state], all); + return; + } + + copy_from_kernel_nofault(&thread_info, task_thread_info(p), + sizeof(thread_info)); + copy_from_kernel_nofault(&task_cpu, &thread_info->cpu, + sizeof(task_cpu)); + if (ops_cpu_valid(task_cpu)) { + struct rq *task_rq; + u64 rq_clock; + unsigned long runnable_at; + + task_rq = cpu_rq(task_cpu); + copy_from_kernel_nofault(&rq_clock, &task_rq->clock, + sizeof(rq_clock)); + copy_from_kernel_nofault(&ops_state, &p->scx.ops_state.counter, + sizeof(ops_state)); + copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, + sizeof(runnable_at)); + delta = rq_clock - runnable_at; + } + + /* Print everything onto one line to conserve console spce. */ + printk("%ssched_ext: %s (%s%s), task: runnable_at=%+lld state=%#lx", + log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, + delta, ops_state); +} + void __init init_sched_ext_class(void) { int cpu; diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 83471e81501a7..6e667c445539b 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); + print_scx_info(log_lvl, current); } /** From 5c2f39c966d178c8451e709430d854b50d286ad5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 17:51:20 -1000 Subject: [PATCH 175/304] scx_common: Add message to _Static_assert in MEMBER_VPTR _Static_assert() without message is a later extension and can fail compilation depending on compile flag. --- tools/sched_ext/scx_common.bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 948f89d1bd91b..5c503c2358368 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -130,7 +130,8 @@ BPF_PROG(name, ##args) #define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ u64 __base = (u64)&(base); \ u64 __addr = (u64)&((base) member) - __base; \ - _Static_assert(sizeof(base) >= sizeof((base) member)); \ + _Static_assert(sizeof(base) >= sizeof((base) member), \ + "@base is smaller than @member, is @base a pointer?"); \ asm volatile ( \ "if %0 <= %[max] goto +2\n" \ "%0 = 0\n" \ From a32fa875b39eddd1964ff1110329640a96217d57 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 18:14:09 -1000 Subject: [PATCH 176/304] tools/sched_ext/ravg: Separate out ravg_read.rs.h and update build deps We want to use rust ravg_read() in other implementations too. Separate out it into a .h file and include it. Note that it also needs to take the inputs in scalar types as the ravg_data types aren't considered the same across different skel's. This can also be a module but for now let's keep it an include file so that it can be copied elsewhere together with the BPF header files. While at it, make BPF builds depend on ravg[_impl].bpf.h. cargo does the right thing without further instructions. 
--- tools/sched_ext/Makefile | 5 +- tools/sched_ext/ravg_read.rs.h | 82 ++++++++++++++++++++++ tools/sched_ext/scx_rusty/src/main.rs | 99 +++++++-------------------- 3 files changed, 109 insertions(+), 77 deletions(-) create mode 100644 tools/sched_ext/ravg_read.rs.h diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 91a2fc8914fa3..43f41688753f2 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -163,8 +163,9 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif -$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h \ - | $(BPFOBJ) $(SCXOBJ_DIR) +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h \ + user_exit_info.h ravg.bpf.h ravg_impl.bpf.h \ + | $(BPFOBJ) $(SCXOBJ_DIR) $(call msg,CLNG-BPF,,$(notdir $@)) $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ diff --git a/tools/sched_ext/ravg_read.rs.h b/tools/sched_ext/ravg_read.rs.h new file mode 100644 index 0000000000000..4efaa2390aa61 --- /dev/null +++ b/tools/sched_ext/ravg_read.rs.h @@ -0,0 +1,82 @@ +/// ravg_read() implementation for rust userland. See ravg_read() in +/// ravg_impl.bpf.h. We don't yet have a good mechanism to share BPF and +/// matching rust code across multiple schedulers. For now, include both BPF +/// and rust code from scheduler implementations. +fn ravg_read( + val: u64, + val_at: u64, + old: u64, + cur: u64, + now: u64, + half_life: u32, + frac_bits: u32, +) -> f64 { + let ravg_1: f64 = (1 << frac_bits) as f64; + let half_life = half_life as u64; + let val = val as f64; + let mut old = old as f64 / ravg_1; + let mut cur = cur as f64 / ravg_1; + + let now = now.max(val_at); + let normalized_dur = |dur| dur as f64 / half_life as f64; + + // + // The following is f64 implementation of BPF ravg_accumulate(). + // + let cur_seq = (now / half_life) as i64; + let val_seq = (val_at / half_life) as i64; + let seq_delta = (cur_seq - val_seq) as i32; + + if seq_delta > 0 { + let full_decay = 2f64.powi(seq_delta); + + // Decay $old and fold $cur into it. + old /= full_decay; + old += cur / full_decay; + cur = 0.0; + + // Fold the oldest period whicy may be partial. + old += val * normalized_dur(half_life - val_at % half_life) / full_decay; + + // Pre-computed decayed full-period values. + const FULL_SUMS: [f64; 20] = [ + 0.5, + 0.75, + 0.875, + 0.9375, + 0.96875, + 0.984375, + 0.9921875, + 0.99609375, + 0.998046875, + 0.9990234375, + 0.99951171875, + 0.999755859375, + 0.9998779296875, + 0.99993896484375, + 0.999969482421875, + 0.9999847412109375, + 0.9999923706054688, + 0.9999961853027344, + 0.9999980926513672, + 0.9999990463256836, + // Use the same value beyond this point. + ]; + + // Fold the full periods in the middle. + if seq_delta >= 2 { + let idx = ((seq_delta - 2) as usize).min(FULL_SUMS.len() - 1); + old += val * FULL_SUMS[idx]; + } + + // Accumulate the current period duration into @cur. + cur += val * normalized_dur(now % half_life); + } else { + cur += val * normalized_dur(now - val_at); + } + + // + // The following is the blending part of BPF ravg_read(). 
+ // + old * (1.0 - normalized_dur(now % half_life) / 2.0) + cur / 2.0 +} diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index e7d509bdf34ca..25e48e41b997c 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -35,9 +35,12 @@ use log::trace; use log::warn; use ordered_float::OrderedFloat; +const RAVG_FRAC_BITS: u32 = rusty_sys::ravg_consts_RAVG_FRAC_BITS; const MAX_DOMS: usize = rusty_sys::consts_MAX_DOMS as usize; const MAX_CPUS: usize = rusty_sys::consts_MAX_CPUS as usize; +include!("../../ravg_read.rs.h"); + /// scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF /// part does simple round robin in each domain and the userspace part /// calculates the load factor of each domain and tells the BPF part how to load @@ -134,78 +137,6 @@ struct Opts { verbose: u8, } -fn ravg_read(rd: &rusty_sys::ravg_data, now: u64, half_life: u32) -> f64 { - const RAVG_1: f64 = (1 << rusty_sys::ravg_consts_RAVG_FRAC_BITS) as f64; - let half_life = half_life as u64; - let val = rd.val as f64; - let val_at = rd.val_at; - let mut old = rd.old as f64 / RAVG_1; - let mut cur = rd.cur as f64 / RAVG_1; - - let now = now.max(val_at); - let normalized_dur = |dur| dur as f64 / half_life as f64; - - // - // The following is f64 implementation of BPF ravg_accumulate(). - // - let cur_seq = (now / half_life) as i64; - let val_seq = (val_at / half_life) as i64; - let seq_delta = (cur_seq - val_seq) as i32; - - if seq_delta > 0 { - let full_decay = 2f64.powi(seq_delta); - - // Decay $old and fold $cur into it. - old /= full_decay; - old += cur / full_decay; - cur = 0.0; - - // Fold the oldest period whicy may be partial. - old += val * normalized_dur(half_life - val_at % half_life) / full_decay; - - // Pre-computed decayed full-period values. - const FULL_SUMS: [f64; 20] = [ - 0.5, - 0.75, - 0.875, - 0.9375, - 0.96875, - 0.984375, - 0.9921875, - 0.99609375, - 0.998046875, - 0.9990234375, - 0.99951171875, - 0.999755859375, - 0.9998779296875, - 0.99993896484375, - 0.999969482421875, - 0.9999847412109375, - 0.9999923706054688, - 0.9999961853027344, - 0.9999980926513672, - 0.9999990463256836, - // Use the same value beyond this point. - ]; - - // Fold the full periods in the middle. - if seq_delta >= 2 { - let idx = ((seq_delta - 2) as usize).min(FULL_SUMS.len() - 1); - old += val * FULL_SUMS[idx]; - } - - // Accumulate the current period duration into @cur. - cur += val * normalized_dur(now % half_life); - } else { - cur += val * normalized_dur(now - val_at); - } - - // - // The following is the blending part of BPF ravg_read(). 
- // - old * (1.0 - normalized_dur(now % half_life) / 2.0) + cur / 2.0 -} - fn now_monotonic() -> u64 { let mut time = libc::timespec { tv_sec: 0, @@ -623,7 +554,16 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { &*(dom_ctx_map_elem.as_slice().as_ptr() as *const rusty_sys::dom_ctx) }; - self.dom_loads[i] = ravg_read(&dom_ctx.load_rd, now_mono, load_half_life); + let rd = &dom_ctx.load_rd; + self.dom_loads[i] = ravg_read( + rd.val, + rd.val_at, + rd.old, + rd.cur, + now_mono, + load_half_life, + RAVG_FRAC_BITS, + ); load_sum += self.dom_loads[i]; } @@ -697,8 +637,17 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { continue; } - let load = - task_ctx.weight as f64 * ravg_read(&task_ctx.dcyc_rd, now_mono, load_half_life); + let rd = &task_ctx.dcyc_rd; + let load = task_ctx.weight as f64 + * ravg_read( + rd.val, + rd.val_at, + rd.old, + rd.cur, + now_mono, + load_half_life, + RAVG_FRAC_BITS, + ); tasks_by_load.insert( OrderedFloat(load), From e322e5655c67d8b0fca37f35e914de057bf533a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 18:19:59 -1000 Subject: [PATCH 177/304] scx_rusty: Misc update --- tools/sched_ext/scx_rusty/src/bpf/rusty.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h index bb40c34f43a07..8a7487cf426c3 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.h +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h @@ -13,9 +13,11 @@ #define __kptr #endif +#ifndef __KERNEL__ typedef unsigned char u8; typedef unsigned int u32; typedef unsigned long long u64; +#endif #include "../../../ravg.bpf.h" From d30e64db91ca51edcb94a1df382d22c14fa63baa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 18:27:06 -1000 Subject: [PATCH 178/304] scx_layered: Build fix after pulling tools/sched_ext Makefile change --- tools/sched_ext/scx_layered/build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs index 744df9e1e301f..ea0bbd48af825 100644 --- a/tools/sched_ext/scx_layered/build.rs +++ b/tools/sched_ext/scx_layered/build.rs @@ -41,8 +41,8 @@ fn bindgen_layered() { } fn gen_bpf_sched(name: &str) { - let bpf_cflags = env::var("SCX_LAYERED_BPF_CFLAGS").unwrap(); - let clang = env::var("SCX_LAYERED_CLANG").unwrap(); + let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); + let clang = env::var("SCX_RUST_CLANG").unwrap(); eprintln!("{}", clang); let outpath = format!("./src/bpf/.output/{}.skel.rs", name); let skel = Path::new(&outpath); From 687fe29c41e08d6576b0f733183b12e8a7527482 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 17:07:41 -1000 Subject: [PATCH 179/304] scx_layered: Updates as per David's review --- .../scx_layered/src/bpf/layered.bpf.c | 49 ++++++++++++++----- tools/sched_ext/scx_layered/src/bpf/layered.h | 2 +- tools/sched_ext/scx_layered/src/main.rs | 27 +++------- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c index 1ee597fdf86cb..72dba391cec9e 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c @@ -21,7 +21,7 @@ const volatile unsigned char all_cpus[MAX_CPUS_U8]; private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; struct layer layers[MAX_LAYERS]; u32 fallback_cpu; -u32 preempt_cursor; +static u32 preempt_cursor; #define dbg(fmt, args...) 
do { if (debug) bpf_printk(fmt, ##args); } while (0) #define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) @@ -148,11 +148,17 @@ static void refresh_cpumasks(int idx) u8 *u8_ptr; if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) { + /* + * XXX - The following test should be outside the loop + * but that makes the verifier think that cont->cpumask + * might be NULL in the loop. + */ barrier_var(cont); if (!cont || !cont->cpumask) { scx_bpf_error("can't happen"); return; } + if (*u8_ptr & (1 << (cpu % 8))) { bpf_cpumask_set_cpu(cpu, cont->cpumask); total++; @@ -311,6 +317,11 @@ static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask, if (tctx->layer_cpus_seq == layer_seq) return; + /* + * XXX - We're assuming that the updated @layer_cpumask matching the new + * @layer_seq is visible which may not be true. For now, leave it as-is. + * Let's update once BPF grows enough memory ordering constructs. + */ bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr); tctx->layer_cpus_seq = layer_seq; trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq); @@ -472,7 +483,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags) continue; scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + + /* + * Round-robining doesn't have to be strict. Let's not bother + * with atomic ops on $preempt_cursor. + */ preempt_cursor = (cpu + 1) % nr_possible_cpus; + lstat_inc(LSTAT_PREEMPT, layer, cctx); break; } @@ -499,10 +516,8 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev) if (!(layer_cpumask = lookup_layer_cpumask(idx))) return; - if (bpf_cpumask_test_cpu(cpu, layer_cpumask)) { - if (scx_bpf_consume(idx)) - return; - } else if (cpu == fallback_cpu && layer->nr_cpus == 0) { + if (bpf_cpumask_test_cpu(cpu, layer_cpumask) || + (cpu == fallback_cpu && layer->nr_cpus == 0)) { if (scx_bpf_consume(idx)) return; } @@ -705,13 +720,17 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, const struct cpumask *cpumask) { struct task_ctx *tctx; - pid_t pid = p->pid; - if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid)) && all_cpumask) - tctx->all_cpus_allowed = - bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); - else - scx_bpf_error("missing task_ctx or all_cpumask"); + if (!(tctx = lookup_task_ctx(p))) + return; + + if (!all_cpumask) { + scx_bpf_error("NULL all_cpumask"); + return; + } + + tctx->all_cpus_allowed = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, @@ -914,6 +933,14 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) cpumask = bpf_cpumask_create(); if (!cpumask) return -ENOMEM; + + /* + * Start all layers with full cpumask so that everything runs + * everywhere. This will soon be updated by refresh_cpumasks() + * once the scheduler starts running. 
+ */ + bpf_cpumask_setall(cpumask); + cpumask = bpf_kptr_xchg(&cont->cpumask, cpumask); if (cpumask) bpf_cpumask_release(cpumask); diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.h b/tools/sched_ext/scx_layered/src/bpf/layered.h index 3191326763b84..bb123a2b4d10c 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.h +++ b/tools/sched_ext/scx_layered/src/bpf/layered.h @@ -27,7 +27,7 @@ enum consts { MAX_COMM = 16, MAX_LAYER_MATCH_ORS = 32, MAX_LAYERS = 16, - USAGE_HALF_LIFE = 1 * 100000000, /* 100ms */ + USAGE_HALF_LIFE = 100000000, /* 100ms */ /* XXX remove */ MAX_CGRP_PREFIXES = 32 diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 838ddd2f6fbb0..38175046c618b 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -88,11 +88,11 @@ lazy_static::lazy_static! { /// * Tasks which are in the cgroup sub-hierarchy under "system.slice". /// * Or tasks whose comm starts with "fbagent" and have a nice value > 0. /// -/// Currenlty, the following matches are supported: +/// Currently, the following matches are supported: /// /// * CgroupPrefix: Matches the prefix of the cgroup that the task belongs /// to. As this is a string match, whether the pattern has the trailing -/// '/' makes difference. For example, "TOP/CHILD/" only matches tasks +/// '/' makes a difference. For example, "TOP/CHILD/" only matches tasks /// which are under that particular cgroup while "TOP/CHILD" also matches /// tasks under "TOP/CHILD0/" or "TOP/CHILD1/". /// @@ -105,7 +105,7 @@ lazy_static::lazy_static! { /// pattern. /// /// While there are complexity limitations as the matches are performed in -/// BPF, it is straight-forward to add more types of matches. +/// BPF, it is straightforward to add more types of matches. /// /// Policies /// ======== @@ -115,8 +115,7 @@ lazy_static::lazy_static! { /// "kind": { /// "Confined": { /// "cpus_range": [1, 8], -/// "util_range": [0.8, 0.9], -/// ] +/// "util_range": [0.8, 0.9] /// } /// } /// @@ -146,7 +145,7 @@ lazy_static::lazy_static! { /// idle CPUs are available. /// /// Similar to matches, adding new policies and extending existing ones -/// should be relatively straight-forward. +/// should be relatively straightforward. /// /// Configuration example and running scx_layered /// ============================================= @@ -255,21 +254,7 @@ struct Opts { #[clap(short = 'e', long)] example: Option, - /// Layer specification. An argument should be a string containing one - /// specification. - /// - /// Prefix of cgroup paths whose tasks are in the batch execution layer. - /// Tasks in this layer will get the weight-matching CPU cycles but may - /// experience higher scheduling latencies. - /// - /// The paths don't have the leading '/' and may or may not have trailing - /// '/'. If there is no trailing '/', the prefix matches any cgroups - /// which have matching prefix upto that point. - /// - /// - "" matches all cgroups. - /// - "/" only matches the root cgroup. - /// - "workload" matches both "workload/work" and "workload-1/work". - /// - "workload/" matches "workload/work" but not "workload-1/work". + /// Layer specification. See --help. 
specs: Vec, } From ecbff41dcce314833827d361dba2b96e3f7f18c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 17:11:08 -1000 Subject: [PATCH 180/304] scx_layered: Use the common ravg implementation --- .../scx_layered/src/bpf/layered.bpf.c | 2 +- .../sched_ext/scx_layered/src/bpf/ravg.bpf.c | 329 ------------------ 2 files changed, 1 insertion(+), 330 deletions(-) delete mode 100644 tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c index 72dba391cec9e..3d8cdaeb206fe 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c @@ -27,7 +27,7 @@ static u32 preempt_cursor; #define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) #include "util.bpf.c" -#include "ravg.bpf.c" +#include "../../../ravg_impl.bpf.h" struct user_exit_info uei; diff --git a/tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c b/tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c deleted file mode 100644 index 91637624fd59b..0000000000000 --- a/tools/sched_ext/scx_layered/src/bpf/ravg.bpf.c +++ /dev/null @@ -1,329 +0,0 @@ -/* to be included in the main bpf.c file */ - -#define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) -//#define RAVG_FN_ATTRS __attribute__((unused)) - -/* - * Running average helpers to be used in BPF progs. Assumes vmlinux.h has - * already been included. - */ -enum ravg_consts { - RAVG_VAL_BITS = 44, /* input values are 44bit */ - RAVG_FRAC_BITS = 20, /* 1048576 is 1.0 */ -}; - -/* - * Running avg mechanism. Accumulates values between 0 and RAVG_MAX_VAL in - * arbitrary time intervals. The accumulated values are halved every half_life - * with each period starting when the current time % half_life is 0. Zeroing is - * enough for initialization. - * - * See ravg_accumulate() and ravg_read() for more details. - */ -struct ravg_data { - /* current value */ - __u64 val; - - /* - * The timestamp of @val. The latest completed seq #: - * - * (val_at / half_life) - 1 - */ - __u64 val_at; - - /* running avg as of the latest completed seq */ - __u64 old; - - /* - * Accumulated value of the current period. Input value is 48bits and we - * normalize half-life to 16bit, so it should fit in an u64. - */ - __u64 cur; -}; - -static RAVG_FN_ATTRS void ravg_add(__u64 *sum, __u64 addend) -{ - __u64 new = *sum + addend; - - if (new >= *sum) - *sum = new; - else - *sum = -1; -} - -static RAVG_FN_ATTRS __u64 ravg_decay(__u64 v, __u32 shift) -{ - if (shift >= 64) - return 0; - else - return v >> shift; -} - -static RAVG_FN_ATTRS __u32 ravg_normalize_dur(__u32 dur, __u32 half_life) -{ - if (dur < half_life) - return (((__u64)dur << RAVG_FRAC_BITS) + half_life - 1) / - half_life; - else - return 1 << RAVG_FRAC_BITS; -} - -/* - * Pre-computed decayed full-period values. This is quicker and keeps the bpf - * verifier happy by removing the need for looping. - * - * [0] = ravg_decay(1 << RAVG_FRAC_BITS, 1) - * [1] = [0] + ravg_decay(1 << RAVG_FRAC_BITS, 2) - * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) - * ... 
- */ -static __u64 ravg_full_sum[] = { - 524288, 786432, 917504, 983040, - 1015808, 1032192, 1040384, 1044480, - 1046528, 1047552, 1048064, 1048320, - 1048448, 1048512, 1048544, 1048560, - 1048568, 1048572, 1048574, 1048575, - /* the same from here on */ -}; - -static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_sum[0]); - -/** - * ravg_accumulate - Accumulate a new value - * @rd: ravg_data to accumulate into - * @new_val: new value - * @now: current timestamp - * @half_life: decay period, must be the same across calls - * - * The current value is changing to @val at @now. Accumulate accordingly. - */ -static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, - __u64 new_val, __u64 now, - __u32 half_life) -{ - __u32 cur_seq, val_seq, seq_delta; - - /* - * It may be difficult for the caller to guarantee monotonic progress if - * multiple CPUs accumulate to the same ravg_data. Handle @now being in - * the past of @rd->val_at. - */ - if (now < rd->val_at) - now = rd->val_at; - - cur_seq = now / half_life; - val_seq = rd->val_at / half_life; - seq_delta = cur_seq - val_seq; - - /* - * Decay ->old and fold ->cur into it. - * - * @end - * v - * timeline |---------|---------|---------|---------|---------| - * seq delta 4 3 2 1 0 - * seq ->seq cur_seq - * val ->old ->cur ^ - * | | | - * \---------+------------------/ - */ - if (seq_delta > 0) { - /* decay ->old to bring it upto the cur_seq - 1 */ - rd->old = ravg_decay(rd->old, seq_delta); - /* non-zero ->cur must be from val_seq, calc and fold */ - ravg_add(&rd->old, ravg_decay(rd->cur, seq_delta)); - /* clear */ - rd->cur = 0; - } - - if (!rd->val) - goto out; - - /* - * Accumulate @rd->val between @rd->val_at and @now. - * - * @rd->val_at @now - * v v - * timeline |---------|---------|---------|---------|---------| - * seq delta [ 3 | 2 | 1 | 0 ] - */ - if (seq_delta > 0) { - __u32 dur; - - /* fold the oldest period which may be partial */ - dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); - ravg_add(&rd->old, rd->val * ravg_decay(dur, seq_delta)); - - /* fold the full periods in the middle with precomputed vals */ - if (seq_delta > 1) { - __u32 idx = seq_delta - 2; - - if (idx < ravg_full_sum_len) - ravg_add(&rd->old, rd->val * - ravg_full_sum[idx]); - else - ravg_add(&rd->old, rd->val * - ravg_full_sum[ravg_full_sum_len - 2]); - } - - /* accumulate the current period duration into ->runtime */ - rd->cur += rd->val * ravg_normalize_dur(now % half_life, - half_life); - } else { - rd->cur += rd->val * ravg_normalize_dur(now - rd->val_at, - half_life); - } -out: - if (new_val >= 1LLU << RAVG_VAL_BITS) - rd->val = (1LLU << RAVG_VAL_BITS) - 1; - else - rd->val = new_val; - rd->val_at = now; -} - -/** - * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) - * @a: multiplicand - * @b: multiplier - * @rshift: number of bits to shift right - * - * Poor man's 128bit arithmetic. Calculate ((@a * @b) >> @rshift) where @a is - * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must - * ensure that the final shifted result fits in u64. 
- */ -static __u64 u64_x_u32_rshift(__u64 a, __u32 b, __u32 rshift) -{ - const __u64 mask32 = (__u32)-1; - __u64 al = a & mask32; - __u64 ah = (a & (mask32 << 32)) >> 32; - - /* - * ah: high 32 al: low 32 - * a |--------------||--------------| - * - * ah * b |--------------||--------------| - * al * b |--------------||--------------| - */ - al *= b; - ah *= b; - - /* - * (ah * b) >> rshift |--------------||--------------| - * (al * b) >> rshift |--------------||--------| - * <--------> - * 32 - rshift - */ - al >>= rshift; - if (rshift <= 32) - ah <<= 32 - rshift; - else - ah >>= rshift - 32; - - return al + ah; -} - -/** - * ravg_read - Read the current running avg - * @rd: ravg_data to read from - * @now: timestamp as of which to read the running avg - * @half_life: decay period, must match ravg_accumulate()'s - * - * Read running avg from @rd as of @now. - */ -static RAVG_FN_ATTRS __u64 ravg_read(struct ravg_data *rd, __u64 now, - __u64 half_life) -{ - struct ravg_data trd; - __u32 elapsed = now % half_life; - - /* - * Accumulate the ongoing period into a temporary copy. This allows - * external readers to access up-to-date avg without strongly - * synchronizing with the updater (we need to add a seq lock tho). - */ - trd = *rd; - rd = &trd; - ravg_accumulate(rd, 0, now, half_life); - - /* - * At the beginning of a new half_life period, the running avg is the - * same as @rd->old. At the beginning of the next, it'd be old load / 2 - * + current load / 2. Inbetween, we blend the two linearly. - */ - if (elapsed) { - __u32 progress = ravg_normalize_dur(elapsed, half_life); - /* - * `H` is the duration of the half-life window, and `E` is how - * much time has elapsed in this window. `P` is [0.0, 1.0] - * representing how much the current window has progressed: - * - * P = E / H - * - * If `old` is @rd->old, we would want to calculate the - * following for blending: - * - * old * (1.0 - P / 2) - * - * Because @progress is [0, 1 << RAVG_FRAC_BITS], let's multiply - * and then divide by 1 << RAVG_FRAC_BITS: - * - * (1 << RAVG_FRAC_BITS) - (1 << RAVG_FRAC_BITS) * P / 2 - * old * ----------------------------------------------------- - * 1 << RAVG_FRAC_BITS - * - * As @progress is (1 << RAVG_FRAC_BITS) * P: - * - * (1 << RAVG_FRAC_BITS) - progress / 2 - * old * ------------------------------------ - * 1 << RAVG_FRAC_BITS - * - * As @rd->old uses full 64bit, the multiplication can overflow, - * but we also know that the final result is gonna be smaller - * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle - * the interim multiplication correctly. - */ - __u64 old = u64_x_u32_rshift(rd->old, - (1 << RAVG_FRAC_BITS) - progress / 2, - RAVG_FRAC_BITS); - /* - * If `S` is the Sum(val * duration) for this half-life window, - * the avg for this window is: - * - * S / E - * - * We would want to calculate the following for blending: - * - * S / E * (P / 2) - * - * As P = E / H, - * - * S / E * (E / H / 2) - * S / H / 2 - * - * Expanding S, the above becomes: - * - * Sum(val * duration) / H / 2 - * Sum(val * (duration / H)) / 2 - * - * As we use RAVG_FRAC_BITS bits for fixed point arithmetic, - * let's multiply the whole result accordingly: - * - * (Sum(val * (duration / H)) / 2) * (1 << RAVG_FRAC_BITS) - * - * duration * (1 << RAVG_FRAC_BITS) - * Sum(val * --------------------------------) / 2 - * H - * - * The righthand multiplier inside Sum() is the normalized - * duration returned from ravg_normalize_dur(), so, the whole - * Sum term equals @rd->cur. 
- * - * rd->cur / 2 - */ - __u64 cur = rd->cur / 2; - - return old + cur; - } else { - return rd->old; - } -} From 1ad52c79a8109d3bfa6864befaf56df84091db1c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 17:27:23 -1000 Subject: [PATCH 181/304] scx_layered: Use tp_btf/task_rename instead of fentry/__set_task_comm --- tools/sched_ext/scx_layered/src/bpf/layered.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c index 3d8cdaeb206fe..e936717708257 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c @@ -298,8 +298,8 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, return 0; } -SEC("fentry/__set_task_comm") -int BPF_PROG(fentry_set_task_comm, struct task_struct *p, const char *buf, bool exec) +SEC("tp_btf/task_rename") +int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf) { struct task_ctx *tctx; From 42a1f1ffd03a1522e589e59751a600ca9c372401 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 17:37:35 -1000 Subject: [PATCH 182/304] scx_layered: s/__u[32|64]/u[32|64]/ --- .../scx_layered/src/bpf/layered.bpf.c | 2 +- tools/sched_ext/scx_layered/src/bpf/layered.h | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c index e936717708257..7e7a72ae26126 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c @@ -38,7 +38,7 @@ static inline bool vtime_before(u64 a, u64 b) struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, __u32); + __type(key, u32); __type(value, struct cpu_ctx); __uint(max_entries, 1); } cpu_ctxs SEC(".maps"); diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.h b/tools/sched_ext/scx_layered/src/bpf/layered.h index bb123a2b4d10c..bfd7485f97c53 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.h +++ b/tools/sched_ext/scx_layered/src/bpf/layered.h @@ -14,8 +14,8 @@ #endif #ifndef __KERNEL__ -typedef unsigned long long __u64; -typedef long long __s64; +typedef unsigned long long u64; +typedef long long s64; #endif enum consts { @@ -50,9 +50,9 @@ enum layer_stat_idx { struct cpu_ctx { bool current_preempt; - __u64 layer_cycles[MAX_LAYERS]; - __u64 gstats[NR_GSTATS]; - __u64 lstats[MAX_LAYERS][NR_LSTATS]; + u64 layer_cycles[MAX_LAYERS]; + u64 gstats[NR_GSTATS]; + u64 lstats[MAX_LAYERS][NR_LSTATS]; }; enum layer_match_kind { @@ -83,11 +83,11 @@ struct layer { bool open; bool preempt; - __u64 vtime_now; - __u64 nr_tasks; - __u64 load_avg; + u64 vtime_now; + u64 nr_tasks; + u64 load_avg; - __u64 cpus_seq; + u64 cpus_seq; unsigned int refresh_cpus; unsigned char cpus[MAX_CPUS_U8]; unsigned int nr_cpus; // managed from BPF side From d70e2097ab97113c0cf6a403f565b658f1313b33 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 18:40:08 -1000 Subject: [PATCH 183/304] scx_layered: Use rust ravg_read() to read load instead of periodically updating from tick --- .../scx_layered/src/bpf/layered.bpf.c | 79 ++++++++++--------- tools/sched_ext/scx_layered/src/bpf/layered.h | 6 +- tools/sched_ext/scx_layered/src/main.rs | 33 +++++++- 3 files changed, 75 insertions(+), 43 deletions(-) diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c index 
7e7a72ae26126..117c04a9e2564 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c @@ -81,52 +81,59 @@ static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, struct cpu_c scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx); } -static struct layer_load { - u64 load; - struct ravg_data ravg_data; -} layer_loads[MAX_LAYERS]; - -private(layer_loads) struct bpf_spin_lock layer_loads_lock; +struct lock_wrapper { + struct bpf_spin_lock lock; +}; -const u64 ravg_1 = 1 << RAVG_FRAC_BITS; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct lock_wrapper); + __uint(max_entries, MAX_LAYERS); + __uint(map_flags, 0); +} layer_load_locks SEC(".maps"); -static void adj_load(u32 layer, s64 adj, u64 now) +static void adj_load(u32 layer_idx, s64 adj, u64 now) { - struct layer_load *load = &layer_loads[layer]; + struct layer *layer; + struct lock_wrapper *lockw; - if (layer >= nr_layers) { - scx_bpf_error("invalid layer %u", layer); + layer = MEMBER_VPTR(layers, [layer_idx]); + lockw = bpf_map_lookup_elem(&layer_load_locks, &layer_idx); + + if (!layer || !lockw) { + scx_bpf_error("Can't access layer%d or its load_lock", layer_idx); return; } - bpf_spin_lock(&layer_loads_lock); - load->load += adj; - ravg_accumulate(&load->ravg_data, load->load, now, USAGE_HALF_LIFE); - bpf_spin_unlock(&layer_loads_lock); + bpf_spin_lock(&lockw->lock); + layer->load += adj; + ravg_accumulate(&layer->load_rd, layer->load, now, USAGE_HALF_LIFE); + bpf_spin_unlock(&lockw->lock); - if (debug && adj < 0 && (s64)load->load < 0) + if (debug && adj < 0 && (s64)layer->load < 0) scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)", - bpf_get_smp_processor_id(), layer, load->load, adj); + bpf_get_smp_processor_id(), layer, layer->load, adj); } -struct layer_cpumask_container { +struct layer_cpumask_wrapper { struct bpf_cpumask __kptr *cpumask; }; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); - __type(value, struct layer_cpumask_container); + __type(value, struct layer_cpumask_wrapper); __uint(max_entries, MAX_LAYERS); __uint(map_flags, 0); } layer_cpumasks SEC(".maps"); static struct cpumask *lookup_layer_cpumask(int idx) { - struct layer_cpumask_container *cont; + struct layer_cpumask_wrapper *cpumaskw; - if ((cont = bpf_map_lookup_elem(&layer_cpumasks, &idx))) { - return (struct cpumask *)cont->cpumask; + if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) { + return (struct cpumask *)cpumaskw->cpumask; } else { scx_bpf_error("no layer_cpumask"); return NULL; @@ -135,14 +142,14 @@ static struct cpumask *lookup_layer_cpumask(int idx) static void refresh_cpumasks(int idx) { - struct layer_cpumask_container *cont; + struct layer_cpumask_wrapper *cpumaskw; struct layer *layer; int cpu, total = 0; if (!__sync_val_compare_and_swap(&layers[idx].refresh_cpus, 1, 0)) return; - cont = bpf_map_lookup_elem(&layer_cpumasks, &idx); + cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx); bpf_for(cpu, 0, nr_possible_cpus) { u8 *u8_ptr; @@ -150,20 +157,20 @@ static void refresh_cpumasks(int idx) if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) { /* * XXX - The following test should be outside the loop - * but that makes the verifier think that cont->cpumask - * might be NULL in the loop. + * but that makes the verifier think that + * cpumaskw->cpumask might be NULL in the loop. 
*/ - barrier_var(cont); - if (!cont || !cont->cpumask) { + barrier_var(cpumaskw); + if (!cpumaskw || !cpumaskw->cpumask) { scx_bpf_error("can't happen"); return; } if (*u8_ptr & (1 << (cpu % 8))) { - bpf_cpumask_set_cpu(cpu, cont->cpumask); + bpf_cpumask_set_cpu(cpu, cpumaskw->cpumask); total++; } else { - bpf_cpumask_clear_cpu(cpu, cont->cpumask); + bpf_cpumask_clear_cpu(cpu, cpumaskw->cpumask); } } else { scx_bpf_error("can't happen"); @@ -191,12 +198,8 @@ int scheduler_tick_fentry(const void *ctx) if (bpf_get_smp_processor_id() != 0) return 0; - now = bpf_ktime_get_ns(); - bpf_for(idx, 0, nr_layers) { - layers[idx].load_avg = ravg_read(&layer_loads[idx].ravg_data, - now, USAGE_HALF_LIFE); + bpf_for(idx, 0, nr_layers) refresh_cpumasks(idx); - } return 0; } @@ -919,7 +922,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) } bpf_for(i, 0, nr_layers) { - struct layer_cpumask_container *cont; + struct layer_cpumask_wrapper *cpumaskw; layers[i].idx = i; @@ -927,7 +930,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) if (ret < 0) return ret; - if (!(cont = bpf_map_lookup_elem(&layer_cpumasks, &i))) + if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &i))) return -ENONET; cpumask = bpf_cpumask_create(); @@ -941,7 +944,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) */ bpf_cpumask_setall(cpumask); - cpumask = bpf_kptr_xchg(&cont->cpumask, cpumask); + cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); if (cpumask) bpf_cpumask_release(cpumask); } diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.h b/tools/sched_ext/scx_layered/src/bpf/layered.h index bfd7485f97c53..bedfa0650c005 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.h +++ b/tools/sched_ext/scx_layered/src/bpf/layered.h @@ -18,6 +18,8 @@ typedef unsigned long long u64; typedef long long s64; #endif +#include "../../../ravg.bpf.h" + enum consts { MAX_CPUS_SHIFT = 9, MAX_CPUS = 1 << MAX_CPUS_SHIFT, @@ -85,7 +87,9 @@ struct layer { u64 vtime_now; u64 nr_tasks; - u64 load_avg; + + u64 load; + struct ravg_data load_rd; u64 cpus_seq; unsigned int refresh_cpus; diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 38175046c618b..6e582ae25b9e8 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -37,20 +37,24 @@ use log::trace; use serde::Deserialize; use serde::Serialize; +const RAVG_FRAC_BITS: u32 = layered_sys::ravg_consts_RAVG_FRAC_BITS; const MAX_CPUS: usize = layered_sys::consts_MAX_CPUS as usize; const MAX_PATH: usize = layered_sys::consts_MAX_PATH as usize; const MAX_COMM: usize = layered_sys::consts_MAX_COMM as usize; const MAX_LAYER_MATCH_ORS: usize = layered_sys::consts_MAX_LAYER_MATCH_ORS as usize; const MAX_LAYERS: usize = layered_sys::consts_MAX_LAYERS as usize; -const USAGE_HALF_LIFE: f64 = layered_sys::consts_USAGE_HALF_LIFE as f64 / 1_000_000_000.0; +const USAGE_HALF_LIFE: u32 = layered_sys::consts_USAGE_HALF_LIFE; +const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0; const NR_GSTATS: usize = layered_sys::global_stat_idx_NR_GSTATS as usize; const NR_LSTATS: usize = layered_sys::layer_stat_idx_NR_LSTATS as usize; const NR_LAYER_MATCH_KINDS: usize = layered_sys::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; const CORE_CACHE_LEVEL: u32 = 2; +include!("../../ravg_read.rs.h"); + lazy_static::lazy_static! 
{ static ref NR_POSSIBLE_CPUS: usize = libbpf_rs::num_possible_cpus().unwrap(); - static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE as f64); + static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64); } /// scx_layered: A highly configurable multi-layer sched_ext scheduler @@ -312,6 +316,16 @@ struct LayerConfig { specs: Vec, } +fn now_monotonic() -> u64 { + let mut time = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; + assert!(ret == 0); + time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 +} + fn read_total_cpu(reader: &procfs::ProcReader) -> Result { reader .read_stat() @@ -490,13 +504,24 @@ struct Stats { impl Stats { fn read_layer_loads(skel: &mut LayeredSkel, nr_layers: usize) -> (f64, Vec) { - let one = skel.rodata().ravg_1; + let now_mono = now_monotonic(); let layer_loads: Vec = skel .bss() .layers .iter() .take(nr_layers) - .map(|layer| layer.load_avg as f64 / one as f64) + .map(|layer| { + let rd = &layer.load_rd; + ravg_read( + rd.val, + rd.val_at, + rd.old, + rd.cur, + now_mono, + USAGE_HALF_LIFE, + RAVG_FRAC_BITS, + ) + }) .collect(); (layer_loads.iter().sum(), layer_loads) } From 9695b050516cd2204a429eaafaaa2b9a62c7eaef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 18:54:22 -1000 Subject: [PATCH 184/304] scx_layered: Cleanups --- tools/sched_ext/scx_layered/src/bpf/layered.bpf.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c index 117c04a9e2564..b0a27f3c71370 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c @@ -113,7 +113,7 @@ static void adj_load(u32 layer_idx, s64 adj, u64 now) if (debug && adj < 0 && (s64)layer->load < 0) scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)", - bpf_get_smp_processor_id(), layer, layer->load, adj); + bpf_get_smp_processor_id(), layer_idx, layer->load, adj); } struct layer_cpumask_wrapper { @@ -192,14 +192,11 @@ static void refresh_cpumasks(int idx) SEC("fentry/scheduler_tick") int scheduler_tick_fentry(const void *ctx) { - u64 now; int idx; - if (bpf_get_smp_processor_id() != 0) - return 0; - - bpf_for(idx, 0, nr_layers) - refresh_cpumasks(idx); + if (bpf_get_smp_processor_id() == 0) + bpf_for(idx, 0, nr_layers) + refresh_cpumasks(idx); return 0; } From d0be8b249559d59960c1399d7e1af9aaf8a3b52d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Nov 2023 19:09:07 -1000 Subject: [PATCH 185/304] scx: s/check_preempt_curr_scx/wakeup_preempt_scx/ to match new upstream naming --- kernel/sched/ext.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5cd65c18b1675..5224942b930f4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2428,7 +2428,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) (struct cpumask *)p->cpus_ptr); } -static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -2626,10 +2626,9 @@ static inline void scx_cgroup_unlock(void) {} /* * Omitted operations: * - * - check_preempt_curr: NOOP as it isn't useful in the wakeup path because the - 
* task isn't tied to the CPU at that point. Preemption is implemented by - * resetting the victim task's slice to 0 and triggering reschedule on the - * target CPU. + * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task + * isn't tied to the CPU at that point. Preemption is implemented by resetting + * the victim task's slice to 0 and triggering reschedule on the target CPU. * * - migrate_task_rq: Unncessary as task to cpu mapping is transient. * @@ -2644,7 +2643,7 @@ DEFINE_SCHED_CLASS(ext) = { .yield_task = yield_task_scx, .yield_to_task = yield_to_task_scx, - .check_preempt_curr = check_preempt_curr_scx, + .wakeup_preempt = wakeup_preempt_scx, .pick_next_task = pick_next_task_scx, From 9dae2334cd2791a2bd212a6f8846d576a264e6c8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 21:32:15 -1000 Subject: [PATCH 186/304] scx: CGROUP_WEIGHT_* should be outside CONFIG_CGROUPS sched_ext needs these consts even when !CGROUPS. They got accidentally moved back inside CONFIG_CGROUPS through merge resolution. --- include/linux/cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d164eaeacfa60..53040f7464c45 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,8 +29,6 @@ struct kernel_clone_args; -#ifdef CONFIG_CGROUPS - /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of @@ -40,6 +38,8 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +#ifdef CONFIG_CGROUPS + enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ From 7410ecce3d12c9cfcb210b4bc8ffa1a0e96ab9d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 21:35:13 -1000 Subject: [PATCH 187/304] scx: cpu_local_stat_show() doesn't have dependency on RT_GROUP_SCHED or EXT_GROUP_SCHED This was incorrectly fixed after an errant merge resolution. Fix it back. --- kernel/sched/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 59fe4a56a526f..e02e4e8c171cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11352,7 +11352,6 @@ static int cpu_extra_stat_show(struct seq_file *sf, return 0; } -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static int cpu_local_stat_show(struct seq_file *sf, struct cgroup_subsys_state *css) { @@ -11370,7 +11369,6 @@ static int cpu_local_stat_show(struct seq_file *sf, #endif return 0; } -#endif #if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) @@ -11563,8 +11561,8 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) .css_local_stat_show = cpu_local_stat_show, +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, From 08e09f3dfd6b0713eb7fec4556422c696a08bbb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 21:48:01 -1000 Subject: [PATCH 188/304] scx: Kill stray check_preempt_cur() prototype Merge artifact. 
--- kernel/sched/sched.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b29eed8813a0c..32eddb62a96dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2573,15 +2573,13 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); - extern void check_class_changing(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class); extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio); -extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT #define SCHED_NR_MIGRATE_BREAK 8 From 7a001f5fd94466eaa382a3ebf5d31438eefde41b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 21:42:27 -1000 Subject: [PATCH 189/304] scx: s/scx_exit_type/scx_exit_kind/ s/scx_exit_info\.type/scx_exit_info\.kind/ These are accessed from userspace and "type" is a reserved token in many modern languages. Let's use "kind" instead. --- include/linux/sched/ext.h | 4 +-- kernel/sched/ext.c | 36 +++++++++---------- kernel/sched/ext.h | 6 ++-- tools/sched_ext/scx_layered/src/main.rs | 14 ++++---- tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 6 ++-- tools/sched_ext/scx_rusty/src/main.rs | 16 ++++----- tools/sched_ext/user_exit_info.h | 6 ++-- 7 files changed, 44 insertions(+), 44 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 62b6e1c34d0f1..b20a7620b93d7 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -55,7 +55,7 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; -enum scx_exit_type { +enum scx_exit_kind { SCX_EXIT_NONE, SCX_EXIT_DONE, @@ -73,7 +73,7 @@ enum scx_exit_type { */ struct scx_exit_info { /* %SCX_EXIT_* - broad category of the exit reason */ - enum scx_exit_type type; + enum scx_exit_kind kind; /* textual representation of the above */ char reason[SCX_EXIT_REASON_LEN]; /* number of entries in the backtrace */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5224942b930f4..0ec680b0cb6a8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -98,7 +98,7 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = { [0 ... 
SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT }; -static atomic_t scx_exit_type = ATOMIC_INIT(SCX_EXIT_DONE); +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); @@ -2157,7 +2157,7 @@ static bool check_rq_for_timeouts(struct rq *rq) last_runnable + scx_watchdog_timeout))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); - scx_ops_error_type(SCX_EXIT_ERROR_STALL, + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, "%s[%d] failed to run for %u.%03us", p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); @@ -2914,24 +2914,24 @@ static void scx_ops_disable_workfn(struct kthread_work *work) struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; const char *reason; - int i, cpu, type; + int i, cpu, kind; - type = atomic_read(&scx_exit_type); + kind = atomic_read(&scx_exit_kind); while (true) { /* * NONE indicates that a new scx_ops has been registered since * disable was scheduled - don't kill the new ops. DONE * indicates that the ops has already been disabled. */ - if (type == SCX_EXIT_NONE || type == SCX_EXIT_DONE) + if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) return; - if (atomic_try_cmpxchg(&scx_exit_type, &type, SCX_EXIT_DONE)) + if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) break; } cancel_delayed_work_sync(&scx_watchdog_work); - switch (type) { + switch (kind) { case SCX_EXIT_UNREG: reason = "BPF scheduler unregistered"; break; @@ -2951,7 +2951,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) reason = ""; } - ei->type = type; + ei->kind = kind; strlcpy(ei->reason, reason, sizeof(ei->reason)); switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { @@ -3075,7 +3075,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) percpu_up_write(&scx_fork_rwsem); cpus_read_unlock(); - if (ei->type >= SCX_EXIT_ERROR) { + if (ei->kind >= SCX_EXIT_ERROR) { printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); if (ei->msg[0] == '\0') @@ -3128,14 +3128,14 @@ static void schedule_scx_ops_disable_work(void) kthread_queue_work(helper, &scx_ops_disable_work); } -static void scx_ops_disable(enum scx_exit_type type) +static void scx_ops_disable(enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; - if (WARN_ON_ONCE(type == SCX_EXIT_NONE || type == SCX_EXIT_DONE)) - type = SCX_EXIT_ERROR; + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; - atomic_try_cmpxchg(&scx_exit_type, &none, type); + atomic_try_cmpxchg(&scx_exit_kind, &none, kind); schedule_scx_ops_disable_work(); } @@ -3147,14 +3147,14 @@ static void scx_ops_error_irq_workfn(struct irq_work *irq_work) static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, +__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, const char *fmt, ...) 
{ struct scx_exit_info *ei = &scx_exit_info; int none = SCX_EXIT_NONE; va_list args; - if (!atomic_try_cmpxchg(&scx_exit_type, &none, type)) + if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) return; ei->bt_len = stack_trace_save(ei->bt, ARRAY_SIZE(ei->bt), 1); @@ -3208,7 +3208,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) SCX_OPS_DISABLED); memset(&scx_exit_info, 0, sizeof(scx_exit_info)); - atomic_set(&scx_exit_type, SCX_EXIT_NONE); + atomic_set(&scx_exit_kind, SCX_EXIT_NONE); scx_warned_zero_slice = false; atomic_long_set(&scx_nr_rejected, 0); @@ -3235,7 +3235,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) * early with success so that the condition is notified through * ops.exit() like other scx_bpf_error() invocations. */ - if (atomic_read(&scx_exit_type) != SCX_EXIT_NONE) + if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE) goto err_disable; } @@ -4367,7 +4367,7 @@ void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) goto out_restore; } - scx_ops_error_type(SCX_EXIT_ERROR_BPF, "%s", bufs->msg); + scx_ops_error_kind(SCX_EXIT_ERROR_BPF, "%s", bufs->msg); out_restore: local_irq_restore(flags); } diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index e9c699a87770f..27248760f4ccb 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -135,10 +135,10 @@ int scx_check_setscheduler(struct task_struct *p, int policy); bool scx_can_stop_tick(struct rq *rq); void init_sched_ext_class(void); -__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, +__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, const char *fmt, ...); #define scx_ops_error(fmt, args...) \ - scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) + scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *p, @@ -174,7 +174,7 @@ static inline void scx_notify_sched_tick(void) if (unlikely(time_after(jiffies, last_check + scx_watchdog_timeout))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_check); - scx_ops_error_type(SCX_EXIT_ERROR_STALL, + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, "watchdog failed to check in for %u.%03us", dur_ms / 1000, dur_ms % 1000); } diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 6e582ae25b9e8..90d05568599dc 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -628,16 +628,16 @@ impl Stats { #[derive(Debug, Default)] struct UserExitInfo { - exit_type: i32, + kind: i32, reason: Option, msg: Option, } impl UserExitInfo { fn read(bpf_uei: &layered_bss_types::user_exit_info) -> Result { - let exit_type = unsafe { std::ptr::read_volatile(&bpf_uei.exit_type as *const _) }; + let kind = unsafe { std::ptr::read_volatile(&bpf_uei.kind as *const _) }; - let (reason, msg) = if exit_type != 0 { + let (reason, msg) = if kind != 0 { ( Some( unsafe { CStr::from_ptr(bpf_uei.reason.as_ptr() as *const _) } @@ -659,14 +659,14 @@ impl UserExitInfo { }; Ok(Self { - exit_type, + kind, reason, msg, }) } fn exited(bpf_uei: &layered_bss_types::user_exit_info) -> Result { - Ok(Self::read(bpf_uei)?.exit_type != 0) + Ok(Self::read(bpf_uei)?.kind != 0) } fn report(&self) -> Result<()> { @@ -676,11 +676,11 @@ impl UserExitInfo { _ => "".into(), }; - match self.exit_type { + match self.kind { 0 => Ok(()), etype => { if etype != 64 { - bail!("BPF exit_type={} {}", etype, why); + bail!("EXIT: kind={} {}", etype, why); } else { info!("EXIT: {}", why); Ok(()) diff --git 
a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index 9286833da571d..e729ef8650239 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -74,7 +74,7 @@ const volatile u64 slice_ns = SCX_SLICE_DFL; /* * Exit info */ -int exit_type = SCX_EXIT_NONE; +int exit_kind = SCX_EXIT_NONE; char exit_msg[SCX_EXIT_MSG_LEN]; /* @@ -828,7 +828,7 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) if (taskc->dom_active_pids_gen != dap_gen) { u64 idx = __sync_fetch_and_add(&dom_active_pids[dom_id].write_idx, 1) % MAX_DOM_ACTIVE_PIDS; - u32 *pidp; + s32 *pidp; pidp = MEMBER_VPTR(dom_active_pids, [dom_id].pids[idx]); if (!pidp) { @@ -1130,7 +1130,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) void BPF_STRUCT_OPS(rusty_exit, struct scx_exit_info *ei) { bpf_probe_read_kernel_str(exit_msg, sizeof(exit_msg), ei->msg); - exit_type = ei->type; + exit_kind = ei->kind; } SEC(".struct_ops.link") diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 25e48e41b997c..128e5b9b72b60 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -1146,13 +1146,13 @@ impl<'a> Scheduler<'a> { Ok(()) } - fn read_bpf_exit_type(&mut self) -> i32 { - unsafe { std::ptr::read_volatile(&self.skel.bss().exit_type as *const _) } + fn read_bpf_exit_kind(&mut self) -> i32 { + unsafe { std::ptr::read_volatile(&self.skel.bss().exit_kind as *const _) } } - fn report_bpf_exit_type(&mut self) -> Result<()> { + fn report_bpf_exit_kind(&mut self) -> Result<()> { // Report msg if EXT_OPS_EXIT_ERROR. - match self.read_bpf_exit_type() { + match self.read_bpf_exit_kind() { 0 => Ok(()), etype if etype == 2 => { let cstr = unsafe { CStr::from_ptr(self.skel.bss().exit_msg.as_ptr() as *const _) }; @@ -1160,10 +1160,10 @@ impl<'a> Scheduler<'a> { .to_str() .context("Failed to convert exit msg to string") .unwrap(); - bail!("BPF exit_type={} msg={}", etype, msg); + bail!("BPF exit_kind={} msg={}", etype, msg); } etype => { - info!("BPF exit_type={}", etype); + info!("BPF exit_kind={}", etype); Ok(()) } } @@ -1174,7 +1174,7 @@ impl<'a> Scheduler<'a> { let mut next_tune_at = now + self.tune_interval; let mut next_sched_at = now + self.sched_interval; - while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_type() == 0 { + while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_kind() == 0 { let now = Instant::now(); if now >= next_tune_at { @@ -1200,7 +1200,7 @@ impl<'a> Scheduler<'a> { ); } - self.report_bpf_exit_type() + self.report_bpf_exit_kind() } } diff --git a/tools/sched_ext/user_exit_info.h b/tools/sched_ext/user_exit_info.h index 9bb0b46480e78..f0e45bf3c7661 100644 --- a/tools/sched_ext/user_exit_info.h +++ b/tools/sched_ext/user_exit_info.h @@ -11,7 +11,7 @@ #define __USER_EXIT_INFO_H struct user_exit_info { - int exit_type; + int kind; char reason[128]; char msg[1024]; }; @@ -27,7 +27,7 @@ static inline void uei_record(struct user_exit_info *uei, bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); /* use __sync to force memory barrier */ - __sync_val_compare_and_swap(&uei->exit_type, uei->exit_type, ei->type); + __sync_val_compare_and_swap(&uei->kind, uei->kind, ei->kind); } #else /* !__bpf__ */ @@ -35,7 +35,7 @@ static inline void uei_record(struct user_exit_info *uei, static inline bool uei_exited(struct user_exit_info *uei) { /* use __sync to force memory 
barrier */ - return __sync_val_compare_and_swap(&uei->exit_type, -1, -1); + return __sync_val_compare_and_swap(&uei->kind, -1, -1); } static inline void uei_print(const struct user_exit_info *uei) From 2c21348e5d89f78a080a5a2dcf5f986ac0b9f7b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 22:13:08 -1000 Subject: [PATCH 190/304] scx: tools/sched_ext/Makefile updates * Remove duplicate target lists. c-sched-targets and rust-sched-targets are the source of truth now. * Drop fullclean target. It's unexpected and unnecessary to have a target which steps up and cleans. * Minor formatting updates. --- tools/sched_ext/Makefile | 135 +++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 1b36ab19aaa10..0ed992dce44fd 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -4,6 +4,8 @@ include ../build/Build.include include ../scripts/Makefile.arch include ../scripts/Makefile.include +all: all_targets + ifneq ($(LLVM),) ifneq ($(filter %/,$(LLVM)),) LLVM_PREFIX := $(LLVM) @@ -126,10 +128,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wall -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_simple scx_qmap scx_central scx_pair scx_flatcg scx_userland scx_rusty scx_layered - # sort removes libbpf duplicates when not cross-building -MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ +MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ $(INCLUDE_DIR) $(SCXOBJ_DIR) $(BINDIR)) @@ -184,7 +184,8 @@ SCX_COMMON_DEPS := user_exit_info.h scx_user_common.h | $(BINDIR) ################ # C schedulers # ################ -c-sched-targets = scx_qmap scx_simple scx_central scx_pair scx_flatcg scx_userland +c-sched-targets = scx_simple scx_qmap scx_central scx_pair scx_flatcg \ + scx_userland $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ @@ -222,8 +223,8 @@ $(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) $(Q)cp $(OUTPUT_DIR)/release/$(sched) $(BINDIR)/$@ install: all - $(Q)mkdir -p $(DESTDIR)/usr/bin/ - $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/bin/ + $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ + $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ clean: $(foreach sched,$(rust-sched-targets),cargo clean --manifest-path=$(sched)/Cargo.toml;) @@ -231,69 +232,67 @@ clean: rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f $(c-sched-targets) -fullclean: clean - $(Q)$(MAKE) -sC ../../ clean - help: - @echo 'Building targets:' - @echo ' all - Compile all schedulers' - @echo '' - @echo 'Alternatively, you may compile individual schedulers:' - @echo ' scx_simple' - @echo ' scx_qmap' - @echo ' scx_central' - @echo ' scx_pair' - @echo ' scx_flatcg' - @echo ' scx_userland' - @echo ' scx_rusty' - @echo '' - @echo 'For any scheduler build target, you may specify an alternative' - @echo 'build output path with the O= environment variable. For example:' - @echo '' - @echo ' O=/tmp/sched_ext make all' - @echo '' - @echo 'will compile all schedulers, and emit the build artifacts to' - @echo '/tmp/sched_ext/build.' - @echo '' - @echo '' - @echo 'Rust schedulers:' - @echo ' scx_rusty - Build the scx_rusty load balancing scheduler.' - @echo ' scx_rusty_deps - Download the scx_rusty scheduler cargo dependencies.' - @echo '' - @echo 'For any cargo rust schedulers built with cargo, you can specify' - @echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the' - @echo 'network (e.g. 
if the scheduler is being packaged).' - @echo '' - @echo 'For such use cases, the build workflow will look something like this:' - @echo '' - @echo ' make scx_rusty_deps' - @echo ' CARGO_OFFLINE=1 make scx_rusty' - @echo '' - @echo 'If network access during build is allowed, you can just make scx_rusty' - @echo 'directly without CARGO_OFFLINE, and dependencies will be downloaded' - @echo 'during the build step.' - @echo '' - @echo '' - @echo 'Installing targets:' - @echo ' install - Compile and install all schedulers to /usr/bin.' - @echo ' You may specify the DESTDIR= environment variable' - @echo ' to indicate a prefix for /usr/bin. For example:' - @echo '' - @echo ' DESTDIR=/tmp/sched_ext make install' - @echo '' - @echo ' will build the schedulers in CWD/build, and' - @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' - @echo '' - @echo '' - @echo 'Cleaning targets:' - @echo ' clean - Remove all generated files, including intermediate' - @echo ' rust files for rust schedulers.' - @echo '' - @echo ' fullclean - Remove all generated files, including intermediate' - @echo ' rust files for rust schedulers, and also trigger a' - @echo ' clean of the kernel at the root of the whole repository.' - -.PHONY: all $(c-sched-targets) $(rust-sched-targets) clean fullclean help + @echo 'Building targets' + @echo '================' + @echo '' + @echo ' all - Compile all schedulers' + @echo '' + @echo 'Alternatively, you may compile individual schedulers:' + @echo '' + @printf ' %s\n' $(c-sched-targets) $(rust-sched-targets) + @echo '' + @echo 'For any scheduler build target, you may specify an alternative' + @echo 'build output path with the O= environment variable. For example:' + @echo '' + @echo ' O=/tmp/sched_ext make all' + @echo '' + @echo 'will compile all schedulers, and emit the build artifacts to' + @echo '/tmp/sched_ext/build.' + @echo '' + @echo '' + @echo 'Rust scheduler targets' + @echo '======================' + @echo '' + @printf ' %s\n' $(rust-sched-targets) + @printf ' %s_deps\n' $(rust-sched-targets) + @echo '' + @echo 'For any rust schedulers built with cargo, you can specify' + @echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the' + @echo 'network (e.g. if the scheduler is being packaged).' + @echo '' + @echo 'For such use cases, the build workflow will look something like this:' + @echo '' + @echo ' make scx_rusty_deps' + @echo ' CARGO_OFFLINE=1 make scx_rusty' + @echo '' + @echo 'If network access during build is allowed, you can just make scx_rusty' + @echo 'directly without CARGO_OFFLINE, and dependencies will be downloaded' + @echo 'during the build step.' + @echo '' + @echo '' + @echo 'Installing targets' + @echo '==================' + @echo '' + @echo ' install - Compile and install all schedulers to /usr/bin.' + @echo ' You may specify the DESTDIR= environment variable' + @echo ' to indicate a prefix for /usr/bin. For example:' + @echo '' + @echo ' DESTDIR=/tmp/sched_ext make install' + @echo '' + @echo ' will build the schedulers in CWD/build, and' + @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' + @echo '' + @echo '' + @echo 'Cleaning targets' + @echo '================' + @echo '' + @echo ' clean - Remove all generated files, including intermediate' + @echo ' rust files for rust schedulers.' 
+ +all_targets: $(c-sched-targets) $(rust-sched-targets) + +.PHONY: all all_targets $(c-sched-targets) $(rust-sched-targets) clean help # delete failed targets .DELETE_ON_ERROR: From dde311ca84556f1e319440e6beb491fdd2c708fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 22:22:14 -1000 Subject: [PATCH 191/304] scx: Reorder tools/sched_ext/README.md To match patch / Makefile order. --- tools/sched_ext/README.md | 163 +++++++++++++++++++------------------- 1 file changed, 82 insertions(+), 81 deletions(-) diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index e030b1931ac26..f809322a031fd 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -33,19 +33,18 @@ is actively working on adding a BPF backend compiler as well, but are still missing some features such as BTF type tags which are necessary for using kptrs. -2. rust >= 1.70.0 +2. pahole >= 1.25 -scx_rusty's user space load balancing component is written in Rust, and uses -features present in the rust toolchain >= 1.70.0. You should be able to use the -stable build from rustup, but if that doesn't work, try using the rustup -nightly build. +You may need pahole in order to generate BTF from DWARF. -There are other requirements as well, such as make, but these are the main / -non-trivial ones. +3. rust >= 1.70.0 -3. pahole >= 1.25 +Rust schedulers uses features present in the rust toolchain >= 1.70.0. You +should be able to use the stable build from rustup, but if that doesn't +work, try using the rustup nightly build. -You may need pahole in order to generate BTF from DWARF. +There are other requirements as well, such as make, but these are the main / +non-trivial ones. ## Compiling the kernel @@ -162,32 +161,46 @@ schedulers. -------------------------------------------------------------------------------- -## Rusty +## scx_simple ### Overview -A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the -scheduler does a simple round robin in each domain, and the user space portion -(written in Rust) calculates the load factor of each domain, and informs BPF of -how tasks should be load balanced accordingly. +A simple scheduler that provides an example of a minimal sched_ext +scheduler. scx_simple can be run in either global weighted vtime mode, or +FIFO mode. ### Typical Use Case -Rusty is designed to be flexible, and accommodate different architectures and -workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), -as well as how Rusty should partition the system into scheduling domains, can -be tuned to achieve the optimal configuration for any given system or workload. +Though very simple, this scheduler should perform reasonably well on +single-socket CPUs with a uniform L3 cache topology. Note that while running in +global FIFO mode may work well for some workloads, saturating threads can +easily drown out inactive ones. ### Production Ready? -Yes. If tuned correctly, rusty should be performant across various CPU -architectures and workloads. Rusty by default creates a separate scheduling -domain per-LLC, so its default configuration may be performant as well. +This scheduler could be used in a production environment, assuming the hardware +constraints enumerated above, and assuming the workload can accommodate a +simple scheduling policy. 
-That said, you may run into an issue with infeasible weights, where a task with -a very high weight may cause the scheduler to incorrectly leave cores idle -because it thinks they're necessary to accommodate the compute for a single -task. This can also happen in CFS, and should soon be addressed for rusty. +-------------------------------------------------------------------------------- + +## scx_qmap + +### Overview + +Another simple, yet slightly more complex scheduler that provides an example of +a basic weighted FIFO queuing policy. It also provides examples of some common +useful BPF features, such as sleepable per-task storage allocation in the +`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to +enqueue tasks. It also illustrates how core-sched support could be implemented. + +### Typical Use Case + +Purely used to illustrate sched_ext features. + +### Production Ready? + +No -------------------------------------------------------------------------------- @@ -216,31 +229,6 @@ and does not yet have any kind of priority mechanism. -------------------------------------------------------------------------------- -## scx_flatcg - -### Overview - -A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical -weight-based cgroup CPU control by flattening the cgroup hierarchy into a -single layer, by compounding the active weight share at each level. The effect -of this is a much more performant CPU controller, which does not need to -descend down cgroup trees in order to properly compute a cgroup's share. - -### Typical Use Case - -This scheduler could be useful for any typical workload requiring a CPU -controller, but which cannot tolerate the higher overheads of the fair CPU -controller. - -### Production Ready? - -Yes, though the scheduler (currently) does not adequately accommodate -thundering herds of cgroups. If, for example, many cgroups which are nested -behind a low-priority cgroup were to wake up around the same time, they may be -able to consume more CPU cycles than they are entitled to. - --------------------------------------------------------------------------------- - ## scx_pair ### Overview @@ -263,46 +251,28 @@ No -------------------------------------------------------------------------------- -## scx_qmap - -### Overview - -Another simple, yet slightly more complex scheduler that provides an example of -a basic weighted FIFO queuing policy. It also provides examples of some common -useful BPF features, such as sleepable per-task storage allocation in the -`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -enqueue tasks. It also illustrates how core-sched support could be implemented. - -### Typical Use Case - -Purely used to illustrate sched_ext features. - -### Production Ready? - -No - --------------------------------------------------------------------------------- - -## scx_simple +## scx_flatcg ### Overview -A simple scheduler that provides an example of a minimal sched_ext -scheduler. scx_simple can be run in either global weighted vtime mode, or -FIFO mode. +A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical +weight-based cgroup CPU control by flattening the cgroup hierarchy into a +single layer, by compounding the active weight share at each level. The effect +of this is a much more performant CPU controller, which does not need to +descend down cgroup trees in order to properly compute a cgroup's share. 
### Typical Use Case -Though very simple, this scheduler should perform reasonably well on -single-socket CPUs with a uniform L3 cache topology. Note that while running in -global FIFO mode may work well for some workloads, saturating threads can -easily drown out inactive ones. +This scheduler could be useful for any typical workload requiring a CPU +controller, but which cannot tolerate the higher overheads of the fair CPU +controller. ### Production Ready? -This scheduler could be used in a production environment, assuming the hardware -constraints enumerated above, and assuming the workload can accommodate a -simple scheduling policy. +Yes, though the scheduler (currently) does not adequately accommodate +thundering herds of cgroups. If, for example, many cgroups which are nested +behind a low-priority cgroup were to wake up around the same time, they may be +able to consume more CPU cycles than they are entitled to. -------------------------------------------------------------------------------- @@ -337,6 +307,37 @@ less performant than just using something like `scx_simple`. It is purely meant to illustrate that it's possible to build a user space scheduler on top of sched_ext. +-------------------------------------------------------------------------------- + +## scx_rusty + +### Overview + +A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the +scheduler does a simple round robin in each domain, and the user space portion +(written in Rust) calculates the load factor of each domain, and informs BPF of +how tasks should be load balanced accordingly. + +### Typical Use Case + +Rusty is designed to be flexible, and accommodate different architectures and +workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), +as well as how Rusty should partition the system into scheduling domains, can +be tuned to achieve the optimal configuration for any given system or workload. + +### Production Ready? + +Yes. If tuned correctly, rusty should be performant across various CPU +architectures and workloads. Rusty by default creates a separate scheduling +domain per-LLC, so its default configuration may be performant as well. + +That said, you may run into an issue with infeasible weights, where a task with +a very high weight may cause the scheduler to incorrectly leave cores idle +because it thinks they're necessary to accommodate the compute for a single +task. This can also happen in CFS, and should soon be addressed for rusty. 
+ +-------------------------------------------------------------------------------- + # Troubleshooting There are a number of common issues that you may run into when building the From 2e58977ff00c1f437158b46373ce54d98757aaf6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 22:28:13 -1000 Subject: [PATCH 192/304] scx: Rename scx_user_common.h to scx_common.h and include user_exit_info.h --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_central.c | 3 +-- tools/sched_ext/{scx_user_common.h => scx_common.h} | 8 +++++--- tools/sched_ext/scx_flatcg.c | 3 +-- tools/sched_ext/scx_pair.c | 3 +-- tools/sched_ext/scx_qmap.c | 3 +-- tools/sched_ext/scx_simple.c | 3 +-- tools/sched_ext/scx_userland.c | 3 +-- 8 files changed, 12 insertions(+), 16 deletions(-) rename tools/sched_ext/{scx_user_common.h => scx_common.h} (93%) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 0ed992dce44fd..94985639b299f 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -179,7 +179,7 @@ $(INCLUDE_DIR)/%.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOO $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $@ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -SCX_COMMON_DEPS := user_exit_info.h scx_user_common.h | $(BINDIR) +SCX_COMMON_DEPS := scx_common.h user_exit_info.h | $(BINDIR) ################ # C schedulers # diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index b77a15ac6f5b9..1e298590091f9 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -11,9 +11,8 @@ #include #include #include -#include "user_exit_info.h" +#include "scx_common.h" #include "scx_central.skel.h" -#include "scx_user_common.h" const char help_fmt[] = "A central FIFO sched_ext scheduler.\n" diff --git a/tools/sched_ext/scx_user_common.h b/tools/sched_ext/scx_common.h similarity index 93% rename from tools/sched_ext/scx_user_common.h rename to tools/sched_ext/scx_common.h index d5b7ce48cd6d7..0e93d6b697b8f 100644 --- a/tools/sched_ext/scx_user_common.h +++ b/tools/sched_ext/scx_common.h @@ -4,13 +4,15 @@ * Copyright (c) 2023 Tejun Heo * Copyright (c) 2023 David Vernet */ -#ifndef __SCHED_EXT_USER_COMMON_H -#define __SCHED_EXT_USER_COMMON_H +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H #include #include #include +#include "user_exit_info.h" + #ifdef __KERNEL__ #error "Should not be included by BPF programs" #endif @@ -54,4 +56,4 @@ bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ } while (0) -#endif /* __SCHED_EXT_USER_COMMON_H */ +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index fbe93083fe641..f824c4b3444aa 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -12,10 +12,9 @@ #include #include #include -#include "user_exit_info.h" +#include "scx_common.h" #include "scx_flatcg.h" #include "scx_flatcg.skel.h" -#include "scx_user_common.h" #ifndef FILEID_KERNFS #define FILEID_KERNFS 0xfe diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index 9e6f3109653c2..48344af0312f5 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -9,10 +9,9 @@ #include #include #include -#include "user_exit_info.h" +#include "scx_common.h" #include "scx_pair.h" #include "scx_pair.skel.h" -#include "scx_user_common.h" const char help_fmt[] = "A demo sched_ext core-scheduler which always makes every sibling 
CPU pair\n" diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index d275adecdc44f..edc3d0a4e8000 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -10,9 +10,8 @@ #include #include #include -#include "user_exit_info.h" +#include "scx_common.h" #include "scx_qmap.skel.h" -#include "scx_user_common.h" const char help_fmt[] = "A simple five-level FIFO queue sched_ext scheduler.\n" diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 5cca991f57889..900f1c3e74ab5 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -9,9 +9,8 @@ #include #include #include -#include "user_exit_info.h" +#include "scx_common.h" #include "scx_simple.skel.h" -#include "scx_user_common.h" const char help_fmt[] = "A simple sched_ext scheduler.\n" diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index d393237818861..a750f10df062d 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -27,10 +27,9 @@ #include #include -#include "user_exit_info.h" +#include "scx_common.h" #include "scx_userland.h" #include "scx_userland.skel.h" -#include "scx_user_common.h" const char help_fmt[] = "A minimal userland sched_ext scheduler.\n" From 0b2403f46b9182e1ed6d47a8598e8975eb1329a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 22:29:37 -1000 Subject: [PATCH 193/304] scx: Minor formatting updates and scx_bpf_error() formatting string param type update --- tools/sched_ext/scx_layered/src/main.rs | 8 ++------ tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 3 ++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 90d05568599dc..7eb2edf53661f 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -504,7 +504,7 @@ struct Stats { impl Stats { fn read_layer_loads(skel: &mut LayeredSkel, nr_layers: usize) -> (f64, Vec) { - let now_mono = now_monotonic(); + let now_mono = now_monotonic(); let layer_loads: Vec = skel .bss() .layers @@ -658,11 +658,7 @@ impl UserExitInfo { (None, None) }; - Ok(Self { - kind, - reason, - msg, - }) + Ok(Self { kind, reason, msg }) } fn exited(bpf_uei: &layered_bss_types::user_exit_info) -> Result { diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c index e729ef8650239..7a8b27ceae054 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c @@ -832,7 +832,8 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) pidp = MEMBER_VPTR(dom_active_pids, [dom_id].pids[idx]); if (!pidp) { - scx_bpf_error("dom_active_pids[%u][%u] indexing failed", dom_id, idx); + scx_bpf_error("dom_active_pids[%u][%llu] indexing failed", + dom_id, idx); return; } From 39b906ecd7cf951597f1b9e9042f5a0cf9bb1c92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 22:46:00 -1000 Subject: [PATCH 194/304] scx: whitespace update --- tools/sched_ext/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index f809322a031fd..8e7194ada3310 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -111,7 +111,7 @@ void BPF_STRUCT_OPS(example_enable, struct task_struct *p, // vmlinux.h provides the definition for struct sched_ext_ops. 
SEC(".struct_ops.link") struct sched_ext_ops example_ops { - .enable = (void *)example_enable, + .enable = (void *)example_enable, .name = "example", } ``` From 725cfa39ae80729b735212428372fb33ab27ce47 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Nov 2023 23:57:56 -1000 Subject: [PATCH 195/304] scx_rusty: doc comment update --- tools/sched_ext/scx_rusty/src/main.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 128e5b9b72b60..0075b6c0848ec 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -41,20 +41,21 @@ const MAX_CPUS: usize = rusty_sys::consts_MAX_CPUS as usize; include!("../../ravg_read.rs.h"); -/// scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF -/// part does simple round robin in each domain and the userspace part -/// calculates the load factor of each domain and tells the BPF part how to load -/// balance the domains. +/// scx_rusty: A multi-domain BPF / userspace hybrid scheduler +/// +/// The BPF part does simple vtime or round robin scheduling in each domain +/// and the userspace part calculates the load factor of each domain and +/// tells the BPF part how to load balance the domains. /// /// This scheduler demonstrates dividing scheduling logic between BPF and -/// userspace and using rust to build the userspace part. An earlier variant of -/// this scheduler was used to balance across six domains, each representing a -/// chiplet in a six-chiplet AMD processor, and could match the performance of -/// production setup using CFS. +/// userspace and using rust to build the userspace part. An earlier variant +/// of this scheduler was used to balance across six domains, each +/// representing a chiplet in a six-chiplet AMD processor, and could match +/// the performance of production setup using CFS. /// -/// WARNING: scx_rusty currently assumes that all domains have equal processing -/// power and at similar distances from each other. This limitation will be -/// removed in the future. +/// WARNING: scx_rusty currently assumes that all domains have equal +/// processing power and at similar distances from each other. This +/// limitation will be removed in the future. #[derive(Debug, Parser)] struct Opts { /// Scheduling slice duration in microseconds. From ea98edfb601bf0546f70f9fa006f54a4783c590f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Nov 2023 06:10:43 -1000 Subject: [PATCH 196/304] scx: Update print_scx_info() comment --- kernel/sched/ext.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0ec680b0cb6a8..983694018424b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3692,13 +3692,12 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) * @log_lvl: the log level to use when printing * @p: target task * - * If @task is running on a sched_ext scheduler, print out the name of the - * sched_ext scheduler, and other various scheduler-related debugging - * information about the task. + * If a sched_ext scheduler is enabled, print the name and state of the + * scheduler. If @p is on sched_ext, print further information about the task. * - * This function can be safely called on any task as long as the - * task_struct itself is accessible. While safe, this function isn't - * synchronized and may print out mixups or garbages of limited length. 
+ * This function can be safely called on any task as long as the task_struct + * itself is accessible. While safe, this function isn't synchronized and may + * print out mixups or garbages of limited length. */ void print_scx_info(const char *log_lvl, struct task_struct *p) { From f23fbab72668cc9c739896fef4b134f5c8582b32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Nov 2023 00:13:57 -1000 Subject: [PATCH 197/304] scx: Update print_scx_info() - p->scx.runnable_at is in jiffies and rq->clock is in ktime ns. Subtracting the two doesn't yield anything useful. Also, it's more intuitive for negative delta to represent past. Fix delta calculation. - ops_state is always 0 for running tasks. Let's skip it for now. - Use return value from copy_from_kernel_nofault() to determine whether the read was successful and clearly report read failures. - scx_enabled() is always nested inside scx_ops_enable_state() != DISABLED. Let's just test the latter. --- kernel/sched/ext.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0ec680b0cb6a8..29ef77bf6858d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3702,51 +3702,35 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) */ void print_scx_info(const char *log_lvl, struct task_struct *p) { - struct sched_class *class = NULL; enum scx_ops_enable_state state = scx_ops_enable_state(); - s64 delta = 0; - long ops_state = 0; - int task_cpu; - struct thread_info *thread_info; const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; + char runnable_at_buf[22] = "?"; + struct sched_class *class; + unsigned long runnable_at; - if (!scx_enabled() || state == SCX_OPS_DISABLED) + if (state == SCX_OPS_DISABLED) return; /* * Carefully check if the task was running on sched_ext, and then * carefully copy the time it's been runnable, and its state. */ - copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)); - if (!class || class != &ext_sched_class) { - printk("%ssched_ext: %s (%s%s)", log_lvl, scx_ops.name, + if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || + class != &ext_sched_class) { + printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all); return; } - copy_from_kernel_nofault(&thread_info, task_thread_info(p), - sizeof(thread_info)); - copy_from_kernel_nofault(&task_cpu, &thread_info->cpu, - sizeof(task_cpu)); - if (ops_cpu_valid(task_cpu)) { - struct rq *task_rq; - u64 rq_clock; - unsigned long runnable_at; - - task_rq = cpu_rq(task_cpu); - copy_from_kernel_nofault(&rq_clock, &task_rq->clock, - sizeof(rq_clock)); - copy_from_kernel_nofault(&ops_state, &p->scx.ops_state.counter, - sizeof(ops_state)); - copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, - sizeof(runnable_at)); - delta = rq_clock - runnable_at; - } + if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, + sizeof(runnable_at))) + scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+lldms", + (s64)(runnable_at - jiffies) * (HZ / MSEC_PER_SEC)); /* Print everything onto one line to conserve console spce. 
*/ - printk("%ssched_ext: %s (%s%s), task: runnable_at=%+lld state=%#lx", + printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, - delta, ops_state); + runnable_at_buf); } void __init init_sched_ext_class(void) From b7e14192e66093a39fce94fc4c8dcb7b0da3dc03 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Nov 2023 06:13:52 -1000 Subject: [PATCH 198/304] rusty: Improve overview documentation as suggested by Josh Don --- tools/sched_ext/scx_rusty/src/main.rs | 33 +++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 0075b6c0848ec..3b0bcd742e054 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -44,14 +44,33 @@ include!("../../ravg_read.rs.h"); /// scx_rusty: A multi-domain BPF / userspace hybrid scheduler /// /// The BPF part does simple vtime or round robin scheduling in each domain -/// and the userspace part calculates the load factor of each domain and -/// tells the BPF part how to load balance the domains. +/// while tracking average load of each domain and duty cycle of each task. /// -/// This scheduler demonstrates dividing scheduling logic between BPF and -/// userspace and using rust to build the userspace part. An earlier variant -/// of this scheduler was used to balance across six domains, each -/// representing a chiplet in a six-chiplet AMD processor, and could match -/// the performance of production setup using CFS. +/// The userspace part performs two roles. First, it makes higher frequency +/// (100ms) tuning decisions. It identifies CPUs which are not too heavily +/// loaded and mark them so that they can pull tasks from other overloaded +/// domains on the fly. +/// +/// Second, it drives lower frequency (2s) load balancing. It determines +/// whether load balancing is necessary by comparing domain load averages. +/// If there are large enough load differences, it examines upto 1024 +/// recently active tasks on the domain to determine which should be +/// migrated. +/// +/// The overhead of userspace operations is low. Load balancing is not +/// performed frequently but work-conservation is still maintained through +/// tuning and greedy execution. Load balancing itself is not that expensive +/// either. It only accesses per-domain load metrics to determine the +/// domains that need load balancing and limited number of per-task metrics +/// for each pushing domain. +/// +/// An earlier variant of this scheduler was used to balance across six +/// domains, each representing a chiplet in a six-chiplet AMD processor, and +/// could match the performance of production setup using CFS. +/// +/// WARNING: Very high weight (low nice value) tasks can throw off load +/// balancing due to infeasible weight problem. This problem will be solved +/// in the near future. /// /// WARNING: scx_rusty currently assumes that all domains have equal /// processing power and at similar distances from each other. This From ca712f89dd260962929925eaeb76a7634a714e32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Nov 2023 09:08:08 -1000 Subject: [PATCH 199/304] scx: Move scx_ops_enable_state_str[] outside CONFIG_SCHED_DEBUG The new print_scx_info() uses scx_ops_enable_state_str[] outside CONFIG_SCHED_DEBUG. Let's relocated it outside of CONFIG_SCHED_DEBUG and to the top. 
Reported-by: Changwoo Min Reported-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1c23938c3dfbb..ebe295eb78de4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -23,6 +23,14 @@ enum scx_ops_enable_state { SCX_OPS_DISABLED, }; +static const char *scx_ops_enable_state_str[] = { + [SCX_OPS_PREPPING] = "prepping", + [SCX_OPS_ENABLING] = "enabling", + [SCX_OPS_ENABLED] = "enabled", + [SCX_OPS_DISABLING] = "disabling", + [SCX_OPS_DISABLED] = "disabled", +}; + /* * sched_ext_entity->ops_state * @@ -3407,14 +3415,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops) } #ifdef CONFIG_SCHED_DEBUG -static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_PREPPING] = "prepping", - [SCX_OPS_ENABLING] = "enabling", - [SCX_OPS_ENABLED] = "enabled", - [SCX_OPS_DISABLING] = "disabling", - [SCX_OPS_DISABLED] = "disabled", -}; - static int scx_debug_show(struct seq_file *m, void *v) { mutex_lock(&scx_ops_enable_mutex); From 6b245e863177bd4c670d95b1eb82b22800622b07 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 25 Nov 2023 10:01:32 -1000 Subject: [PATCH 200/304] scx: Fix a straggling atomic64_set --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e02e4e8c171cb..d50e9dfee5172 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4561,7 +4561,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->scx.sticky_cpu = -1; p->scx.holding_cpu = -1; p->scx.kf_mask = 0; - atomic64_set(&p->scx.ops_state, 0); + atomic_long_set(&p->scx.ops_state, 0); p->scx.runnable_at = INITIAL_JIFFIES; p->scx.slice = SCX_SLICE_DFL; #endif From 70331a6138bef1b6106f7e64882480a0c12f9a09 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Nov 2023 15:10:53 -1000 Subject: [PATCH 201/304] scx: Use .bpf.[sub]skel.h suffix instead of .[sub]skel.h when building schedulers This is to make life easier for the user sched/tools repo which uses meson to build. 
--- tools/sched_ext/Makefile | 12 ++++++------ tools/sched_ext/scx_central.c | 2 +- tools/sched_ext/scx_flatcg.c | 2 +- tools/sched_ext/scx_pair.c | 2 +- tools/sched_ext/scx_qmap.bpf.c | 1 - tools/sched_ext/scx_qmap.c | 2 +- tools/sched_ext/scx_simple.c | 2 +- tools/sched_ext/scx_userland.c | 2 +- 8 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 94985639b299f..2380cbe5845cb 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -169,15 +169,15 @@ $(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h \ $(call msg,CLNG-BPF,,$(notdir $@)) $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -$(INCLUDE_DIR)/%.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) $(eval sched=$(notdir $@)) $(call msg,GEN-SKEL,,$(sched)) $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) - $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $@ - $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) SCX_COMMON_DEPS := scx_common.h user_exit_info.h | $(BINDIR) @@ -190,7 +190,7 @@ c-sched-targets = scx_simple scx_qmap scx_central scx_pair scx_flatcg \ $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ $(filter-out %.bpf.c,%.c) \ - $(INCLUDE_DIR)/%.skel.h \ + $(INCLUDE_DIR)/%.bpf.skel.h \ $(SCX_COMMON_DEPS) $(eval sched=$(notdir $@)) $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o @@ -229,7 +229,7 @@ install: all clean: $(foreach sched,$(rust-sched-targets),cargo clean --manifest-path=$(sched)/Cargo.toml;) rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) - rm -f *.o *.bpf.o *.skel.h *.subskel.h + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h rm -f $(c-sched-targets) help: @@ -297,5 +297,5 @@ all_targets: $(c-sched-targets) $(rust-sched-targets) # delete failed targets .DELETE_ON_ERROR: -# keep intermediate (.skel.h, .bpf.o, etc) targets +# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets .SECONDARY: diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 1e298590091f9..914993d31fa83 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -12,7 +12,7 @@ #include #include #include "scx_common.h" -#include "scx_central.skel.h" +#include "scx_central.bpf.skel.h" const char help_fmt[] = "A central FIFO sched_ext scheduler.\n" diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index f824c4b3444aa..886891540cbd8 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -14,7 +14,7 @@ #include #include "scx_common.h" #include "scx_flatcg.h" -#include "scx_flatcg.skel.h" +#include "scx_flatcg.bpf.skel.h" #ifndef FILEID_KERNFS #define FILEID_KERNFS 0xfe diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index 48344af0312f5..7c377f180724a 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -11,7 +11,7 @@ #include #include "scx_common.h" #include "scx_pair.h" -#include "scx_pair.skel.h" +#include "scx_pair.bpf.skel.h" const 
char help_fmt[] = "A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index b6365df0fb640..090548ddd2898 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -23,7 +23,6 @@ * Copyright (c) 2022 David Vernet */ #include "scx_common.bpf.h" -#include char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index edc3d0a4e8000..0b85067703fb8 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -11,7 +11,7 @@ #include #include #include "scx_common.h" -#include "scx_qmap.skel.h" +#include "scx_qmap.bpf.skel.h" const char help_fmt[] = "A simple five-level FIFO queue sched_ext scheduler.\n" diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 900f1c3e74ab5..b09b76be0a267 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -10,7 +10,7 @@ #include #include #include "scx_common.h" -#include "scx_simple.skel.h" +#include "scx_simple.bpf.skel.h" const char help_fmt[] = "A simple sched_ext scheduler.\n" diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index a750f10df062d..7b5322b3f1c85 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -29,7 +29,7 @@ #include "scx_common.h" #include "scx_userland.h" -#include "scx_userland.skel.h" +#include "scx_userland.bpf.skel.h" const char help_fmt[] = "A minimal userland sched_ext scheduler.\n" From 7a1c90f939bee72a215c5729a14d3642b1ef4656 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Nov 2023 15:13:58 -1000 Subject: [PATCH 202/304] scx: Add s/uSIZE typedefs in scx_common.h The availability of s/uSIZE types are hit and miss. Let's always define them in terms of stdint types. This makes life easier for the scx user repo. --- tools/sched_ext/scx_common.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/scx_common.h b/tools/sched_ext/scx_common.h index 0e93d6b697b8f..7019d9f2da603 100644 --- a/tools/sched_ext/scx_common.h +++ b/tools/sched_ext/scx_common.h @@ -7,15 +7,25 @@ #ifndef __SCHED_EXT_COMMON_H #define __SCHED_EXT_COMMON_H +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + #include #include #include +#include #include "user_exit_info.h" -#ifdef __KERNEL__ -#error "Should not be included by BPF programs" -#endif +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; #define SCX_BUG(__fmt, ...) \ do { \ From bc7c2afcbfb45836ba58edfbbf7497b884c395d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Nov 2023 22:00:40 -1000 Subject: [PATCH 203/304] scx_{rusty|layered}: Generate skel file in $OUT_DIR Currently, skel files are put in src/bpf/.output. Place it inside $OUT_DIR where build artifacts belong. 
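
As a rough sketch of the pattern this patch adopts (hypothetical "example" names, not the exact build.rs in the hunks below), the build script compiles the BPF source and emits both the object file and the generated skeleton into Cargo's $OUT_DIR, using the same libbpf-cargo SkeletonBuilder calls these schedulers already rely on:

    // build.rs -- minimal sketch, illustrative names only
    use std::env;
    use std::path::PathBuf;

    use libbpf_cargo::SkeletonBuilder;

    fn main() {
        let out = PathBuf::from(env::var("OUT_DIR").unwrap());
        SkeletonBuilder::new()
            .source("./src/bpf/example.bpf.c")                    // hypothetical BPF source
            .obj(out.join("example.bpf.o"))                       // object lands in $OUT_DIR
            .build_and_generate(out.join("example.bpf.skel.rs"))  // skeleton lands in $OUT_DIR
            .unwrap();
        println!("cargo:rerun-if-changed=src/bpf/example.bpf.c");
    }

The crate side then pulls the skeleton in with include!(concat!(env!("OUT_DIR"), "/example.bpf.skel.rs")), which is what the new layered.rs and rusty.rs wrapper files below do for the real schedulers.
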
--- tools/sched_ext/scx_layered/build.rs | 24 ++++++---------------- tools/sched_ext/scx_layered/src/layered.rs | 12 +++++++++++ tools/sched_ext/scx_layered/src/main.rs | 1 - tools/sched_ext/scx_rusty/build.rs | 24 ++++++---------------- tools/sched_ext/scx_rusty/src/main.rs | 1 - tools/sched_ext/scx_rusty/src/rusty.rs | 12 +++++++++++ 6 files changed, 36 insertions(+), 38 deletions(-) create mode 100644 tools/sched_ext/scx_layered/src/layered.rs create mode 100644 tools/sched_ext/scx_rusty/src/rusty.rs diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs index ea0bbd48af825..0f113716db14b 100644 --- a/tools/sched_ext/scx_layered/build.rs +++ b/tools/sched_ext/scx_layered/build.rs @@ -5,8 +5,6 @@ extern crate bindgen; use std::env; -use std::fs::create_dir_all; -use std::path::Path; use std::path::PathBuf; use glob::glob; @@ -43,17 +41,16 @@ fn bindgen_layered() { fn gen_bpf_sched(name: &str) { let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); let clang = env::var("SCX_RUST_CLANG").unwrap(); - eprintln!("{}", clang); - let outpath = format!("./src/bpf/.output/{}.skel.rs", name); - let skel = Path::new(&outpath); let src = format!("./src/bpf/{}.bpf.c", name); - let obj = format!("./src/bpf/.output/{}.bpf.o", name); + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let skel_path = out_path.join(format!("{}.bpf.skel.rs", name)); + let obj = out_path.join(format!("{}.bpf.o", name)); SkeletonBuilder::new() - .source(src.clone()) - .obj(obj) + .source(&src) + .obj(&obj) .clang(clang) .clang_args(bpf_cflags) - .build_and_generate(skel) + .build_and_generate(&skel_path) .unwrap(); // Trigger rebuild if any .[hc] files are changed in the directory. @@ -64,14 +61,5 @@ fn gen_bpf_sched(name: &str) { fn main() { bindgen_layered(); - // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. - // Reasons are because the generated skeleton contains compiler attributes - // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` - // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside - // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). - // - // However, there is hope! When the above feature stabilizes we can clean this - // all up. - create_dir_all("./src/bpf/.output").unwrap(); gen_bpf_sched("layered"); } diff --git a/tools/sched_ext/scx_layered/src/layered.rs b/tools/sched_ext/scx_layered/src/layered.rs new file mode 100644 index 0000000000000..660499863daba --- /dev/null +++ b/tools/sched_ext/scx_layered/src/layered.rs @@ -0,0 +1,12 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +// We can't directly include the generated skeleton in main.rs as it may +// contain compiler attributes that can't be `include!()`ed via macro and we +// can't use the `#[path = "..."]` because `concat!(env!("OUT_DIR"), +// "/bpf.skel.rs")` does not work inside the path attribute yet (see +// https://github.com/rust-lang/rust/pull/83366). 
+ +include!(concat!(env!("OUT_DIR"), "/layered.bpf.skel.rs")); diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 7eb2edf53661f..8f9e1a7ba6964 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -2,7 +2,6 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -#[path = "bpf/.output/layered.skel.rs"] mod layered; pub use layered::*; pub mod layered_sys; diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index c54b8f33c5778..6397a1ed0045a 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -5,8 +5,6 @@ extern crate bindgen; use std::env; -use std::fs::create_dir_all; -use std::path::Path; use std::path::PathBuf; use libbpf_cargo::SkeletonBuilder; @@ -42,31 +40,21 @@ fn bindgen_rusty() { fn gen_bpf_sched(name: &str) { let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); let clang = env::var("SCX_RUST_CLANG").unwrap(); - eprintln!("{}", clang); - let outpath = format!("./src/bpf/.output/{}.skel.rs", name); - let skel = Path::new(&outpath); let src = format!("./src/bpf/{}.bpf.c", name); - let obj = format!("./src/bpf/.output/{}.bpf.o", name); + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let skel_path = out_path.join(format!("{}.bpf.skel.rs", name)); + let obj = out_path.join(format!("{}.bpf.o", name)); SkeletonBuilder::new() - .source(src.clone()) - .obj(obj) + .source(&src) + .obj(&obj) .clang(clang) .clang_args(bpf_cflags) - .build_and_generate(skel) + .build_and_generate(&skel_path) .unwrap(); println!("cargo:rerun-if-changed={}", src); } fn main() { bindgen_rusty(); - // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. - // Reasons are because the generated skeleton contains compiler attributes - // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` - // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside - // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). - // - // However, there is hope! When the above feature stabilizes we can clean this - // all up. - create_dir_all("./src/bpf/.output").unwrap(); gen_bpf_sched("rusty"); } diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 3b0bcd742e054..841f9a28a788d 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -2,7 +2,6 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -#[path = "bpf/.output/rusty.skel.rs"] mod rusty; pub use rusty::*; pub mod rusty_sys; diff --git a/tools/sched_ext/scx_rusty/src/rusty.rs b/tools/sched_ext/scx_rusty/src/rusty.rs new file mode 100644 index 0000000000000..485ad5150dd0e --- /dev/null +++ b/tools/sched_ext/scx_rusty/src/rusty.rs @@ -0,0 +1,12 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +// We can't directly include the generated skeleton in main.rs as it may +// contain compiler attributes that can't be `include!()`ed via macro and we +// can't use the `#[path = "..."]` because `concat!(env!("OUT_DIR"), +// "/bpf.skel.rs")` does not work inside the path attribute yet (see +// https://github.com/rust-lang/rust/pull/83366). 
+ +include!(concat!(env!("OUT_DIR"), "/rusty.bpf.skel.rs")); From 1d9acf6d6985364c413caf2e4c7a0ab776b1eee8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Nov 2023 08:52:46 -1000 Subject: [PATCH 204/304] scx_{rusty|layered}: ravg_read is now provided by scx_utils crate, remove .rs.h file --- tools/sched_ext/ravg_read.rs.h | 82 ------------------------- tools/sched_ext/scx_layered/Cargo.toml | 1 + tools/sched_ext/scx_layered/src/main.rs | 3 +- tools/sched_ext/scx_rusty/Cargo.toml | 1 + tools/sched_ext/scx_rusty/src/main.rs | 3 +- 5 files changed, 4 insertions(+), 86 deletions(-) delete mode 100644 tools/sched_ext/ravg_read.rs.h diff --git a/tools/sched_ext/ravg_read.rs.h b/tools/sched_ext/ravg_read.rs.h deleted file mode 100644 index 4efaa2390aa61..0000000000000 --- a/tools/sched_ext/ravg_read.rs.h +++ /dev/null @@ -1,82 +0,0 @@ -/// ravg_read() implementation for rust userland. See ravg_read() in -/// ravg_impl.bpf.h. We don't yet have a good mechanism to share BPF and -/// matching rust code across multiple schedulers. For now, include both BPF -/// and rust code from scheduler implementations. -fn ravg_read( - val: u64, - val_at: u64, - old: u64, - cur: u64, - now: u64, - half_life: u32, - frac_bits: u32, -) -> f64 { - let ravg_1: f64 = (1 << frac_bits) as f64; - let half_life = half_life as u64; - let val = val as f64; - let mut old = old as f64 / ravg_1; - let mut cur = cur as f64 / ravg_1; - - let now = now.max(val_at); - let normalized_dur = |dur| dur as f64 / half_life as f64; - - // - // The following is f64 implementation of BPF ravg_accumulate(). - // - let cur_seq = (now / half_life) as i64; - let val_seq = (val_at / half_life) as i64; - let seq_delta = (cur_seq - val_seq) as i32; - - if seq_delta > 0 { - let full_decay = 2f64.powi(seq_delta); - - // Decay $old and fold $cur into it. - old /= full_decay; - old += cur / full_decay; - cur = 0.0; - - // Fold the oldest period whicy may be partial. - old += val * normalized_dur(half_life - val_at % half_life) / full_decay; - - // Pre-computed decayed full-period values. - const FULL_SUMS: [f64; 20] = [ - 0.5, - 0.75, - 0.875, - 0.9375, - 0.96875, - 0.984375, - 0.9921875, - 0.99609375, - 0.998046875, - 0.9990234375, - 0.99951171875, - 0.999755859375, - 0.9998779296875, - 0.99993896484375, - 0.999969482421875, - 0.9999847412109375, - 0.9999923706054688, - 0.9999961853027344, - 0.9999980926513672, - 0.9999990463256836, - // Use the same value beyond this point. - ]; - - // Fold the full periods in the middle. - if seq_delta >= 2 { - let idx = ((seq_delta - 2) as usize).min(FULL_SUMS.len() - 1); - old += val * FULL_SUMS[idx]; - } - - // Accumulate the current period duration into @cur. - cur += val * normalized_dur(now % half_life); - } else { - cur += val * normalized_dur(now - val_at); - } - - // - // The following is the blending part of BPF ravg_read(). 
- // - old * (1.0 - normalized_dur(now % half_life) / 2.0) + cur / 2.0 -} diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml index 6ba1b98d25cd9..6567ec748be4c 100644 --- a/tools/sched_ext/scx_layered/Cargo.toml +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -7,6 +7,7 @@ description = "Userspace scheduling with BPF for Ads" license = "GPL-2.0-only" [dependencies] +scx_utils = "0.1" anyhow = "1.0" bitvec = "1.0" clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 8f9e1a7ba6964..3562fb8bb8f0e 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -33,6 +33,7 @@ use libbpf_rs::skel::SkelBuilder as _; use log::debug; use log::info; use log::trace; +use scx_utils::ravg::ravg_read; use serde::Deserialize; use serde::Serialize; @@ -49,8 +50,6 @@ const NR_LSTATS: usize = layered_sys::layer_stat_idx_NR_LSTATS as usize; const NR_LAYER_MATCH_KINDS: usize = layered_sys::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; const CORE_CACHE_LEVEL: u32 = 2; -include!("../../ravg_read.rs.h"); - lazy_static::lazy_static! { static ref NR_POSSIBLE_CPUS: usize = libbpf_rs::num_possible_cpus().unwrap(); static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64); diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml index b0edd3b937d41..77c0205c23e29 100644 --- a/tools/sched_ext/scx_rusty/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -7,6 +7,7 @@ description = "Userspace scheduling with BPF" license = "GPL-2.0-only" [dependencies] +scx_utils = "0.1" anyhow = "1.0.65" bitvec = { version = "1.0", features = ["serde"] } clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 841f9a28a788d..57c568b3e9c98 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -33,13 +33,12 @@ use log::info; use log::trace; use log::warn; use ordered_float::OrderedFloat; +use scx_utils::ravg::ravg_read; const RAVG_FRAC_BITS: u32 = rusty_sys::ravg_consts_RAVG_FRAC_BITS; const MAX_DOMS: usize = rusty_sys::consts_MAX_DOMS as usize; const MAX_CPUS: usize = rusty_sys::consts_MAX_CPUS as usize; -include!("../../ravg_read.rs.h"); - /// scx_rusty: A multi-domain BPF / userspace hybrid scheduler /// /// The BPF part does simple vtime or round robin scheduling in each domain From 2e2daa7e5331a3b576eee3b73dea709c41407276 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Nov 2023 20:45:42 -1000 Subject: [PATCH 205/304] scx_{rusty|layered}: Make naming and build consistent between the two rust userland schedulers - NAME_sys and NAME was used to refer to rust wrapper of the bindgen-generated header file and the bpf skeleton, respectively. The NAME part is self-referential and thus doesn't really signify anything and _sys suffix is arbitrary too. Let's use bpf_intf and bpf_skel instead. - The env vars that are used during build are a bit unusual and the SCX_RUST_CLANG name is a bit confusing as it doesn't indicate it's for compiling BPF. Let's use the names BPF_CLANG and BPF_CFLAGS instead. - build.rs is now identical between the two schedulers. 
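
Put together, both scheduler crates end up with the same shape under the new naming. The following is an illustrative sketch of the module wiring only (the real files are in the diffs below); bpf_skel.rs and bpf_intf.rs remain thin include!() wrappers, as the renamed wrapper files below show:

    // src/main.rs -- module header under the new convention (sketch)
    mod bpf_skel;        // wraps the libbpf-rs skeleton generated into $OUT_DIR
    pub use bpf_skel::*;
    pub mod bpf_intf;    // wraps the bindgen bindings generated from src/bpf/intf.h

On the build side, the Makefile now exports BPF_CLANG and BPF_CFLAGS, and build.rs reads them with env::var() before handing them to SkeletonBuilder, as the Makefile and build.rs hunks below show.
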
--- tools/sched_ext/Makefile | 4 +- tools/sched_ext/scx_layered/build.rs | 23 ++--- .../scx_layered/src/bpf/{layered.h => intf.h} | 6 +- .../src/bpf/{layered.bpf.c => main.bpf.c} | 2 +- .../src/bpf_intf.rs} | 2 +- .../rusty.rs => scx_layered/src/bpf_skel.rs} | 2 +- tools/sched_ext/scx_layered/src/main.rs | 84 +++++++++---------- tools/sched_ext/scx_rusty/Cargo.toml | 1 + tools/sched_ext/scx_rusty/build.rs | 32 ++++--- .../scx_rusty/src/bpf/{rusty.h => intf.h} | 6 +- .../src/bpf/{rusty.bpf.c => main.bpf.c} | 2 +- .../src/bpf_intf.rs} | 2 +- .../layered.rs => scx_rusty/src/bpf_skel.rs} | 2 +- tools/sched_ext/scx_rusty/src/main.rs | 77 +++++++++-------- 14 files changed, 126 insertions(+), 119 deletions(-) rename tools/sched_ext/scx_layered/src/bpf/{layered.h => intf.h} (96%) rename tools/sched_ext/scx_layered/src/bpf/{layered.bpf.c => main.bpf.c} (99%) rename tools/sched_ext/{scx_rusty/src/rusty_sys.rs => scx_layered/src/bpf_intf.rs} (84%) rename tools/sched_ext/{scx_rusty/src/rusty.rs => scx_layered/src/bpf_skel.rs} (89%) rename tools/sched_ext/scx_rusty/src/bpf/{rusty.h => intf.h} (97%) rename tools/sched_ext/scx_rusty/src/bpf/{rusty.bpf.c => main.bpf.c} (99%) rename tools/sched_ext/{scx_layered/src/layered_sys.rs => scx_rusty/src/bpf_intf.rs} (83%) rename tools/sched_ext/{scx_layered/src/layered.rs => scx_rusty/src/bpf_skel.rs} (89%) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 2380cbe5845cb..43926befe86a4 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -216,8 +216,8 @@ $(addsuffix _deps,$(rust-sched-targets)): $(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) $(eval export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR)) - $(eval export SCX_RUST_CLANG = $(CLANG)) - $(eval export SCX_RUST_BPF_CFLAGS= $(BPF_CFLAGS)) + $(eval export BPF_CLANG = $(CLANG)) + $(eval export BPF_CFLAGS = $(BPF_CFLAGS)) $(eval sched=$(notdir $@)) $(Q)cargo build --manifest-path=$(sched)/Cargo.toml $(CARGOFLAGS) $(Q)cp $(OUTPUT_DIR)/release/$(sched) $(BINDIR)/$@ diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs index 0f113716db14b..4f240bfbc43be 100644 --- a/tools/sched_ext/scx_layered/build.rs +++ b/tools/sched_ext/scx_layered/build.rs @@ -10,9 +10,10 @@ use std::path::PathBuf; use glob::glob; use libbpf_cargo::SkeletonBuilder; -const HEADER_PATH: &str = "src/bpf/layered.h"; +const HEADER_PATH: &str = "src/bpf/intf.h"; +const SKEL_NAME: &str = "bpf"; -fn bindgen_layered() { +fn bindgen_bpf_intf() { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed={}", HEADER_PATH); @@ -34,17 +35,17 @@ fn bindgen_layered() { // Write the bindings to the $OUT_DIR/bindings.rs file. 
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); bindings - .write_to_file(out_path.join("layered_sys.rs")) + .write_to_file(out_path.join("bpf_intf.rs")) .expect("Couldn't write bindings!"); } -fn gen_bpf_sched(name: &str) { - let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); - let clang = env::var("SCX_RUST_CLANG").unwrap(); - let src = format!("./src/bpf/{}.bpf.c", name); +fn gen_bpf_skel() { + let bpf_cflags = env::var("BPF_CFLAGS").unwrap(); + let clang = env::var("BPF_CLANG").unwrap(); + let src = format!("./src/bpf/main.bpf.c"); let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - let skel_path = out_path.join(format!("{}.bpf.skel.rs", name)); - let obj = out_path.join(format!("{}.bpf.o", name)); + let skel_path = out_path.join(format!("{}_skel.rs", SKEL_NAME)); + let obj = out_path.join(format!("{}.bpf.o", SKEL_NAME)); SkeletonBuilder::new() .source(&src) .obj(&obj) @@ -60,6 +61,6 @@ fn gen_bpf_sched(name: &str) { } fn main() { - bindgen_layered(); - gen_bpf_sched("layered"); + bindgen_bpf_intf(); + gen_bpf_skel(); } diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.h b/tools/sched_ext/scx_layered/src/bpf/intf.h similarity index 96% rename from tools/sched_ext/scx_layered/src/bpf/layered.h rename to tools/sched_ext/scx_layered/src/bpf/intf.h index bedfa0650c005..9b9e6cb909a0e 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.h +++ b/tools/sched_ext/scx_layered/src/bpf/intf.h @@ -2,8 +2,8 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -#ifndef __LAYERED_H -#define __LAYERED_H +#ifndef __INTF_H +#define __INTF_H #include #ifndef __kptr @@ -97,4 +97,4 @@ struct layer { unsigned int nr_cpus; // managed from BPF side }; -#endif /* __LAYERED_H */ +#endif /* __INTF_H */ diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c similarity index 99% rename from tools/sched_ext/scx_layered/src/bpf/layered.bpf.c rename to tools/sched_ext/scx_layered/src/bpf/main.bpf.c index b0a27f3c71370..4b3330785c5d9 100644 --- a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c @@ -1,6 +1,6 @@ /* Copyright (c) Meta Platforms, Inc. and affiliates. */ #include "../../../scx_common.bpf.h" -#include "layered.h" +#include "intf.h" #include #include diff --git a/tools/sched_ext/scx_rusty/src/rusty_sys.rs b/tools/sched_ext/scx_layered/src/bpf_intf.rs similarity index 84% rename from tools/sched_ext/scx_rusty/src/rusty_sys.rs rename to tools/sched_ext/scx_layered/src/bpf_intf.rs index e948d81e7356e..0ed31f8e08738 100644 --- a/tools/sched_ext/scx_rusty/src/rusty_sys.rs +++ b/tools/sched_ext/scx_layered/src/bpf_intf.rs @@ -7,4 +7,4 @@ #![allow(non_snake_case)] #![allow(dead_code)] -include!(concat!(env!("OUT_DIR"), "/rusty_sys.rs")); +include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs")); diff --git a/tools/sched_ext/scx_rusty/src/rusty.rs b/tools/sched_ext/scx_layered/src/bpf_skel.rs similarity index 89% rename from tools/sched_ext/scx_rusty/src/rusty.rs rename to tools/sched_ext/scx_layered/src/bpf_skel.rs index 485ad5150dd0e..063ccf896d61e 100644 --- a/tools/sched_ext/scx_rusty/src/rusty.rs +++ b/tools/sched_ext/scx_layered/src/bpf_skel.rs @@ -9,4 +9,4 @@ // "/bpf.skel.rs")` does not work inside the path attribute yet (see // https://github.com/rust-lang/rust/pull/83366). 
-include!(concat!(env!("OUT_DIR"), "/rusty.bpf.skel.rs")); +include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs")); diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 3562fb8bb8f0e..8f4d77db04ea9 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -2,9 +2,9 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -mod layered; -pub use layered::*; -pub mod layered_sys; +mod bpf_skel; +pub use bpf_skel::*; +pub mod bpf_intf; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -37,17 +37,17 @@ use scx_utils::ravg::ravg_read; use serde::Deserialize; use serde::Serialize; -const RAVG_FRAC_BITS: u32 = layered_sys::ravg_consts_RAVG_FRAC_BITS; -const MAX_CPUS: usize = layered_sys::consts_MAX_CPUS as usize; -const MAX_PATH: usize = layered_sys::consts_MAX_PATH as usize; -const MAX_COMM: usize = layered_sys::consts_MAX_COMM as usize; -const MAX_LAYER_MATCH_ORS: usize = layered_sys::consts_MAX_LAYER_MATCH_ORS as usize; -const MAX_LAYERS: usize = layered_sys::consts_MAX_LAYERS as usize; -const USAGE_HALF_LIFE: u32 = layered_sys::consts_USAGE_HALF_LIFE; +const RAVG_FRAC_BITS: u32 = bpf_intf::ravg_consts_RAVG_FRAC_BITS; +const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize; +const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize; +const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize; +const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize; +const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize; +const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE; const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0; -const NR_GSTATS: usize = layered_sys::global_stat_idx_NR_GSTATS as usize; -const NR_LSTATS: usize = layered_sys::layer_stat_idx_NR_LSTATS as usize; -const NR_LAYER_MATCH_KINDS: usize = layered_sys::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; +const NR_GSTATS: usize = bpf_intf::global_stat_idx_NR_GSTATS as usize; +const NR_LSTATS: usize = bpf_intf::layer_stat_idx_NR_LSTATS as usize; +const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; const CORE_CACHE_LEVEL: u32 = 2; lazy_static::lazy_static! 
{ @@ -408,7 +408,7 @@ fn format_bitvec(bitvec: &BitVec) -> String { output } -fn read_cpu_ctxs(skel: &LayeredSkel) -> Result> { +fn read_cpu_ctxs(skel: &BpfSkel) -> Result> { let mut cpu_ctxs = vec![]; let cpu_ctxs_vec = skel .maps() @@ -418,7 +418,7 @@ fn read_cpu_ctxs(skel: &LayeredSkel) -> Result> { .unwrap(); for cpu in 0..*NR_POSSIBLE_CPUS { cpu_ctxs.push(*unsafe { - &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const layered_sys::cpu_ctx) + &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx) }); } Ok(cpu_ctxs) @@ -432,7 +432,7 @@ struct BpfStats { } impl BpfStats { - fn read(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Self { + fn read(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Self { let mut gstats = vec![0u64; NR_GSTATS]; let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers]; @@ -501,7 +501,7 @@ struct Stats { } impl Stats { - fn read_layer_loads(skel: &mut LayeredSkel, nr_layers: usize) -> (f64, Vec) { + fn read_layer_loads(skel: &mut BpfSkel, nr_layers: usize) -> (f64, Vec) { let now_mono = now_monotonic(); let layer_loads: Vec = skel .bss() @@ -524,7 +524,7 @@ impl Stats { (layer_loads.iter().sum(), layer_loads) } - fn read_layer_cycles(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Vec { + fn read_layer_cycles(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec { let mut layer_cycles = vec![0u64; nr_layers]; for cpu in 0..*NR_POSSIBLE_CPUS { @@ -536,7 +536,7 @@ impl Stats { layer_cycles } - fn new(skel: &mut LayeredSkel, proc_reader: &procfs::ProcReader) -> Result { + fn new(skel: &mut BpfSkel, proc_reader: &procfs::ProcReader) -> Result { let nr_layers = skel.rodata().nr_layers as usize; let bpf_stats = BpfStats::read(&read_cpu_ctxs(skel)?, nr_layers); @@ -563,7 +563,7 @@ impl Stats { fn refresh( &mut self, - skel: &mut LayeredSkel, + skel: &mut BpfSkel, proc_reader: &procfs::ProcReader, now: Instant, ) -> Result<()> { @@ -632,7 +632,7 @@ struct UserExitInfo { } impl UserExitInfo { - fn read(bpf_uei: &layered_bss_types::user_exit_info) -> Result { + fn read(bpf_uei: &bpf_bss_types::user_exit_info) -> Result { let kind = unsafe { std::ptr::read_volatile(&bpf_uei.kind as *const _) }; let (reason, msg) = if kind != 0 { @@ -659,7 +659,7 @@ impl UserExitInfo { Ok(Self { kind, reason, msg }) } - fn exited(bpf_uei: &layered_bss_types::user_exit_info) -> Result { + fn exited(bpf_uei: &bpf_bss_types::user_exit_info) -> Result { Ok(Self::read(bpf_uei)?.kind != 0) } @@ -1100,7 +1100,7 @@ impl Layer { } struct Scheduler<'a> { - skel: LayeredSkel<'a>, + skel: BpfSkel<'a>, struct_ops: Option, layer_specs: Vec, @@ -1121,7 +1121,7 @@ struct Scheduler<'a> { } impl<'a> Scheduler<'a> { - fn init_layers(skel: &mut OpenLayeredSkel, specs: &Vec) -> Result<()> { + fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec) -> Result<()> { skel.rodata().nr_layers = specs.len() as u32; for (spec_i, spec) in specs.iter().enumerate() { @@ -1132,19 +1132,19 @@ impl<'a> Scheduler<'a> { let mt = &mut layer.matches[or_i].matches[and_i]; match and { LayerMatch::CgroupPrefix(prefix) => { - mt.kind = layered_sys::layer_match_kind_MATCH_CGROUP_PREFIX as i32; + mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32; copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str()); } LayerMatch::CommPrefix(prefix) => { - mt.kind = layered_sys::layer_match_kind_MATCH_COMM_PREFIX as i32; + mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32; copy_into_cstr(&mut mt.comm_prefix, prefix.as_str()); } LayerMatch::NiceAbove(nice) => { - mt.kind = 
layered_sys::layer_match_kind_MATCH_NICE_ABOVE as i32; + mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32; mt.nice_above_or_below = *nice; } LayerMatch::NiceBelow(nice) => { - mt.kind = layered_sys::layer_match_kind_MATCH_NICE_BELOW as i32; + mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32; mt.nice_above_or_below = *nice; } } @@ -1171,7 +1171,7 @@ impl<'a> Scheduler<'a> { let mut cpu_pool = CpuPool::new()?; // Open the BPF prog first for verification. - let mut skel_builder = LayeredSkelBuilder::default(); + let mut skel_builder = BpfSkelBuilder::default(); skel_builder.obj_builder.debug(opts.verbose > 1); let mut skel = skel_builder.open().context("Failed to open BPF program")?; @@ -1227,7 +1227,7 @@ impl<'a> Scheduler<'a> { }) } - fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut layered_bss_types::layer) { + fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut bpf_bss_types::layer) { for bit in 0..layer.cpus.len() { if layer.cpus[bit] { bpf_layer.cpus[bit / 8] |= 1 << (bit % 8); @@ -1323,8 +1323,8 @@ impl<'a> Scheduler<'a> { self.prev_processing_dur = self.processing_dur; let lsum = |idx| stats.bpf_stats.lstats_sums[idx as usize]; - let total = lsum(layered_sys::layer_stat_idx_LSTAT_LOCAL) - + lsum(layered_sys::layer_stat_idx_LSTAT_GLOBAL); + let total = lsum(bpf_intf::layer_stat_idx_LSTAT_LOCAL) + + lsum(bpf_intf::layer_stat_idx_LSTAT_GLOBAL); let lsum_pct = |idx| { if total != 0 { lsum(idx) as f64 / total as f64 * 100.0 @@ -1336,11 +1336,11 @@ impl<'a> Scheduler<'a> { info!( "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", total, - lsum_pct(layered_sys::layer_stat_idx_LSTAT_LOCAL), - lsum_pct(layered_sys::layer_stat_idx_LSTAT_OPEN_IDLE), - lsum_pct(layered_sys::layer_stat_idx_LSTAT_AFFN_VIOL), + lsum_pct(bpf_intf::layer_stat_idx_LSTAT_LOCAL), + lsum_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE), + lsum_pct(bpf_intf::layer_stat_idx_LSTAT_AFFN_VIOL), stats.prev_bpf_stats.gstats - [layered_sys::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], + [bpf_intf::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], processing_dur.as_millis(), ); @@ -1366,8 +1366,8 @@ impl<'a> Scheduler<'a> { for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() { let lstat = |sidx| stats.bpf_stats.lstats[lidx][sidx as usize]; - let ltotal = lstat(layered_sys::layer_stat_idx_LSTAT_LOCAL) - + lstat(layered_sys::layer_stat_idx_LSTAT_GLOBAL); + let ltotal = lstat(bpf_intf::layer_stat_idx_LSTAT_LOCAL) + + lstat(bpf_intf::layer_stat_idx_LSTAT_GLOBAL); let lstat_pct = |sidx| { if ltotal != 0 { lstat(sidx) as f64 / ltotal as f64 * 100.0 @@ -1390,10 +1390,10 @@ impl<'a> Scheduler<'a> { " {: #ifndef __kptr @@ -94,4 +94,4 @@ struct dom_ctx { u64 dbg_load_printed_at; }; -#endif /* __RUSTY_H */ +#endif /* __INTF_H */ diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c similarity index 99% rename from tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c rename to tools/sched_ext/scx_rusty/src/bpf/main.bpf.c index 7a8b27ceae054..befd8d4c6c6ea 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c @@ -37,7 +37,7 @@ */ #include "../../../scx_common.bpf.h" #include "../../../ravg_impl.bpf.h" -#include "rusty.h" +#include "intf.h" #include #include diff --git a/tools/sched_ext/scx_layered/src/layered_sys.rs b/tools/sched_ext/scx_rusty/src/bpf_intf.rs similarity index 83% rename from 
tools/sched_ext/scx_layered/src/layered_sys.rs rename to tools/sched_ext/scx_rusty/src/bpf_intf.rs index afc821d388d2c..0ed31f8e08738 100644 --- a/tools/sched_ext/scx_layered/src/layered_sys.rs +++ b/tools/sched_ext/scx_rusty/src/bpf_intf.rs @@ -7,4 +7,4 @@ #![allow(non_snake_case)] #![allow(dead_code)] -include!(concat!(env!("OUT_DIR"), "/layered_sys.rs")); +include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs")); diff --git a/tools/sched_ext/scx_layered/src/layered.rs b/tools/sched_ext/scx_rusty/src/bpf_skel.rs similarity index 89% rename from tools/sched_ext/scx_layered/src/layered.rs rename to tools/sched_ext/scx_rusty/src/bpf_skel.rs index 660499863daba..063ccf896d61e 100644 --- a/tools/sched_ext/scx_layered/src/layered.rs +++ b/tools/sched_ext/scx_rusty/src/bpf_skel.rs @@ -9,4 +9,4 @@ // "/bpf.skel.rs")` does not work inside the path attribute yet (see // https://github.com/rust-lang/rust/pull/83366). -include!(concat!(env!("OUT_DIR"), "/layered.bpf.skel.rs")); +include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs")); diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 57c568b3e9c98..3d802e27d9ea2 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -2,9 +2,9 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -mod rusty; -pub use rusty::*; -pub mod rusty_sys; +mod bpf_skel; +pub use bpf_skel::*; +pub mod bpf_intf; use std::cell::Cell; use std::collections::BTreeMap; @@ -35,9 +35,9 @@ use log::warn; use ordered_float::OrderedFloat; use scx_utils::ravg::ravg_read; -const RAVG_FRAC_BITS: u32 = rusty_sys::ravg_consts_RAVG_FRAC_BITS; -const MAX_DOMS: usize = rusty_sys::consts_MAX_DOMS as usize; -const MAX_CPUS: usize = rusty_sys::consts_MAX_CPUS as usize; +const RAVG_FRAC_BITS: u32 = bpf_intf::ravg_consts_RAVG_FRAC_BITS; +const MAX_DOMS: usize = bpf_intf::consts_MAX_DOMS as usize; +const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize; /// scx_rusty: A multi-domain BPF / userspace hybrid scheduler /// @@ -420,7 +420,7 @@ impl Tuner { }) } - fn step(&mut self, skel: &mut RustySkel) -> Result<()> { + fn step(&mut self, skel: &mut BpfSkel) -> Result<()> { let curr_cpu_stats = self .proc_reader .read_stat()? @@ -496,7 +496,7 @@ struct TaskInfo { } struct LoadBalancer<'a, 'b, 'c> { - skel: &'a mut RustySkel<'b>, + skel: &'a mut BpfSkel<'b>, top: Arc, skip_kworkers: bool, @@ -531,7 +531,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; fn new( - skel: &'a mut RustySkel<'b>, + skel: &'a mut BpfSkel<'b>, top: Arc, skip_kworkers: bool, nr_lb_data_errors: &'c mut u64, @@ -568,9 +568,8 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { .lookup(&key, libbpf_rs::MapFlags::ANY) .context("Failed to lookup dom_ctx")? { - let dom_ctx = unsafe { - &*(dom_ctx_map_elem.as_slice().as_ptr() as *const rusty_sys::dom_ctx) - }; + let dom_ctx = + unsafe { &*(dom_ctx_map_elem.as_slice().as_ptr() as *const bpf_intf::dom_ctx) }; let rd = &dom_ctx.load_rd; self.dom_loads[i] = ravg_read( @@ -620,7 +619,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // // XXX - We can't read task_ctx inline because self.skel.bss() // borrows mutably and thus conflicts with self.skel.maps(). 
- const MAX_PIDS: u64 = rusty_sys::consts_MAX_DOM_ACTIVE_PIDS as u64; + const MAX_PIDS: u64 = bpf_intf::consts_MAX_DOM_ACTIVE_PIDS as u64; let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; let mut pids = vec![]; @@ -649,7 +648,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { if let Some(task_data_elem) = task_data.lookup(&key, libbpf_rs::MapFlags::ANY)? { let task_ctx = - unsafe { &*(task_data_elem.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; + unsafe { &*(task_data_elem.as_slice().as_ptr() as *const bpf_intf::task_ctx) }; if task_ctx.dom_id != dom { continue; @@ -860,7 +859,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { } struct Scheduler<'a> { - skel: RustySkel<'a>, + skel: BpfSkel<'a>, struct_ops: Option, sched_interval: Duration, @@ -882,7 +881,7 @@ struct Scheduler<'a> { impl<'a> Scheduler<'a> { fn init(opts: &Opts) -> Result { // Open the BPF prog first for verification. - let mut skel_builder = RustySkelBuilder::default(); + let mut skel_builder = BpfSkelBuilder::default(); skel_builder.obj_builder.debug(opts.verbose > 0); let mut skel = skel_builder.open().context("Failed to open BPF program")?; @@ -1024,7 +1023,7 @@ impl<'a> Scheduler<'a> { let mut stats: Vec = Vec::new(); let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.top.nr_cpus]; - for stat in 0..rusty_sys::stat_idx_RUSTY_NR_STATS { + for stat in 0..bpf_intf::stat_idx_RUSTY_NR_STATS { let cpu_stat_vec = stats_map .lookup_percpu(&stat.to_ne_bytes(), libbpf_rs::MapFlags::ANY) .with_context(|| format!("Failed to lookup stat {}", stat))? @@ -1057,22 +1056,22 @@ impl<'a> Scheduler<'a> { imbal: &[f64], ) { let stat = |idx| stats[idx as usize]; - let total = stat(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC) - + stat(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE) - + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE) - + stat(rusty_sys::stat_idx_RUSTY_STAT_PINNED) - + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH) - + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY) - + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR) - + stat(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH) - + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY); + let total = stat(bpf_intf::stat_idx_RUSTY_STAT_WAKE_SYNC) + + stat(bpf_intf::stat_idx_RUSTY_STAT_PREV_IDLE) + + stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_IDLE) + + stat(bpf_intf::stat_idx_RUSTY_STAT_PINNED) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_DISPATCH) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH) + + stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY); info!( "cpu={:7.2} bal={} load_avg={:8.2} task_err={} lb_data_err={} proc={:?}ms", cpu_busy * 100.0, - stats[rusty_sys::stat_idx_RUSTY_STAT_LOAD_BALANCE as usize], + stats[bpf_intf::stat_idx_RUSTY_STAT_LOAD_BALANCE as usize], load_avg, - stats[rusty_sys::stat_idx_RUSTY_STAT_TASK_GET_ERR as usize], + stats[bpf_intf::stat_idx_RUSTY_STAT_TASK_GET_ERR as usize], self.nr_lb_data_errors, processing_dur.as_millis(), ); @@ -1082,25 +1081,25 @@ impl<'a> Scheduler<'a> { info!( "tot={:7} wsync={:5.2} prev_idle={:5.2} greedy_idle={:5.2} pin={:5.2}", total, - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PINNED), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_WAKE_SYNC), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_PREV_IDLE), + 
stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_IDLE), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_PINNED), ); info!( "dir={:5.2} dir_greedy={:5.2} dir_greedy_far={:5.2}", - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_DISPATCH), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR), ); info!( "dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}", - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_KICK_GREEDY), - stat_pct(rusty_sys::stat_idx_RUSTY_STAT_REPATRIATE), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_KICK_GREEDY), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_REPATRIATE), ); let ti = &self.skel.bss().tune_input; From 2d46bf9089ddc22d154c38978e7ced2e53648f07 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Nov 2023 21:25:12 -1000 Subject: [PATCH 206/304] scx_{rusty|layered}: Run bindgen's clang with CLANG_CFLAGS and remove explicit paths from includes So that build env can decide where to put these headers. --- tools/sched_ext/scx_layered/build.rs | 12 ++++++++---- tools/sched_ext/scx_layered/src/bpf/intf.h | 2 +- tools/sched_ext/scx_layered/src/bpf/main.bpf.c | 4 ++-- tools/sched_ext/scx_rusty/build.rs | 12 ++++++++---- tools/sched_ext/scx_rusty/src/bpf/intf.h | 2 +- tools/sched_ext/scx_rusty/src/bpf/main.bpf.c | 4 ++-- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs index 4f240bfbc43be..ff3bb0b76e794 100644 --- a/tools/sched_ext/scx_layered/build.rs +++ b/tools/sched_ext/scx_layered/build.rs @@ -21,6 +21,8 @@ fn bindgen_bpf_intf() { // to bindgen, and lets you build up options for // the resulting bindings. let bindings = bindgen::Builder::default() + // Should run clang with the same -I options as BPF compilation. + .clang_args(env::var("BPF_CFLAGS").unwrap().split_whitespace()) // The input header we would like to generate // bindings for. .header(HEADER_PATH) @@ -41,21 +43,23 @@ fn bindgen_bpf_intf() { fn gen_bpf_skel() { let bpf_cflags = env::var("BPF_CFLAGS").unwrap(); - let clang = env::var("BPF_CLANG").unwrap(); + let bpf_clang = env::var("BPF_CLANG").unwrap(); + let src = format!("./src/bpf/main.bpf.c"); let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - let skel_path = out_path.join(format!("{}_skel.rs", SKEL_NAME)); let obj = out_path.join(format!("{}.bpf.o", SKEL_NAME)); + let skel_path = out_path.join(format!("{}_skel.rs", SKEL_NAME)); + SkeletonBuilder::new() .source(&src) .obj(&obj) - .clang(clang) + .clang(bpf_clang) .clang_args(bpf_cflags) .build_and_generate(&skel_path) .unwrap(); // Trigger rebuild if any .[hc] files are changed in the directory. 
- for path in glob("./src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { + for path in glob("src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); } } diff --git a/tools/sched_ext/scx_layered/src/bpf/intf.h b/tools/sched_ext/scx_layered/src/bpf/intf.h index 9b9e6cb909a0e..8513779d5f547 100644 --- a/tools/sched_ext/scx_layered/src/bpf/intf.h +++ b/tools/sched_ext/scx_layered/src/bpf/intf.h @@ -18,7 +18,7 @@ typedef unsigned long long u64; typedef long long s64; #endif -#include "../../../ravg.bpf.h" +#include "ravg.bpf.h" enum consts { MAX_CPUS_SHIFT = 9, diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c index 4b3330785c5d9..d4714f89ee69f 100644 --- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c @@ -1,5 +1,5 @@ /* Copyright (c) Meta Platforms, Inc. and affiliates. */ -#include "../../../scx_common.bpf.h" +#include "scx_common.bpf.h" #include "intf.h" #include @@ -27,7 +27,7 @@ static u32 preempt_cursor; #define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) #include "util.bpf.c" -#include "../../../ravg_impl.bpf.h" +#include "ravg_impl.bpf.h" struct user_exit_info uei; diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index 4f240bfbc43be..ff3bb0b76e794 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -21,6 +21,8 @@ fn bindgen_bpf_intf() { // to bindgen, and lets you build up options for // the resulting bindings. let bindings = bindgen::Builder::default() + // Should run clang with the same -I options as BPF compilation. + .clang_args(env::var("BPF_CFLAGS").unwrap().split_whitespace()) // The input header we would like to generate // bindings for. .header(HEADER_PATH) @@ -41,21 +43,23 @@ fn bindgen_bpf_intf() { fn gen_bpf_skel() { let bpf_cflags = env::var("BPF_CFLAGS").unwrap(); - let clang = env::var("BPF_CLANG").unwrap(); + let bpf_clang = env::var("BPF_CLANG").unwrap(); + let src = format!("./src/bpf/main.bpf.c"); let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - let skel_path = out_path.join(format!("{}_skel.rs", SKEL_NAME)); let obj = out_path.join(format!("{}.bpf.o", SKEL_NAME)); + let skel_path = out_path.join(format!("{}_skel.rs", SKEL_NAME)); + SkeletonBuilder::new() .source(&src) .obj(&obj) - .clang(clang) + .clang(bpf_clang) .clang_args(bpf_cflags) .build_and_generate(&skel_path) .unwrap(); // Trigger rebuild if any .[hc] files are changed in the directory. 
- for path in glob("./src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { + for path in glob("src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); } } diff --git a/tools/sched_ext/scx_rusty/src/bpf/intf.h b/tools/sched_ext/scx_rusty/src/bpf/intf.h index 34e2e5af76a31..54d28696ac5a7 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/intf.h +++ b/tools/sched_ext/scx_rusty/src/bpf/intf.h @@ -19,7 +19,7 @@ typedef unsigned int u32; typedef unsigned long long u64; #endif -#include "../../../ravg.bpf.h" +#include "ravg.bpf.h" enum consts { MAX_CPUS = 512, diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c index befd8d4c6c6ea..c82ad8973d96a 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c @@ -35,8 +35,8 @@ * task weight, dom mask and current dom in the task_data map and executes the * load balance based on userspace populating the lb_data map. */ -#include "../../../scx_common.bpf.h" -#include "../../../ravg_impl.bpf.h" +#include "scx_common.bpf.h" +#include "ravg_impl.bpf.h" #include "intf.h" #include From 65d1b96d784980849399dbf66ece0a8903d04103 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 Nov 2023 08:23:05 -1000 Subject: [PATCH 207/304] scx_{rusty|layered}: Factor out build.rs's into scx_utils::build_helpers This greatly simplifies build.rs and allows building more common logic into build_helpers such as discovering BPF_CFLAGS on its own without depending on upper level Makefile. Some caveats: - Dropped static libbpf-sys dep. scx_utils is out of kernel tree and pulls in libbpf-sys through libbpf-cargo which conflicts with the explicit libbpf-sys dependency. This means that we use packaged version of libbpf-cargo for skel generation. Should be fine. - Path dependency for scx_utils is temporary during development. Should be dropped later. 
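One direction this enables, not shown in this patch, is having build_helpers
derive BPF_CFLAGS on its own instead of relying on the Makefile. The sketch
below is hypothetical: the flag set and the clang query are illustrative only
and do not reflect the actual scx_utils implementation:

  use std::env;
  use std::process::Command;

  // Hypothetical helper: prefer Makefile-provided flags, otherwise ask clang
  // for its resource directory and assemble a minimal BPF flag set.
  fn discover_bpf_cflags() -> String {
      if let Ok(cflags) = env::var("BPF_CFLAGS") {
          return cflags;
      }
      let clang = env::var("BPF_CLANG").unwrap_or_else(|_| "clang".into());
      let out = Command::new(&clang)
          .arg("-print-resource-dir")
          .output()
          .expect("failed to run clang");
      let resource_dir = String::from_utf8_lossy(&out.stdout).trim().to_string();
      format!("-g -O2 -I{}/include", resource_dir)
  }

  fn main() {
      println!("cargo:warning=BPF_CFLAGS = {}", discover_bpf_cflags());
  }
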
--- tools/sched_ext/scx_layered/Cargo.toml | 7 +-- tools/sched_ext/scx_layered/build.rs | 65 +------------------------- tools/sched_ext/scx_rusty/Cargo.toml | 7 +-- tools/sched_ext/scx_rusty/build.rs | 65 +------------------------- 4 files changed, 8 insertions(+), 136 deletions(-) diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml index 6567ec748be4c..53726a2233082 100644 --- a/tools/sched_ext/scx_layered/Cargo.toml +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -7,7 +7,6 @@ description = "Userspace scheduling with BPF for Ads" license = "GPL-2.0-only" [dependencies] -scx_utils = "0.1" anyhow = "1.0" bitvec = "1.0" clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } @@ -15,17 +14,15 @@ ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7" lazy_static = "1.4" libbpf-rs = "0.21" -libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } libc = "0.2" log = "0.4" +scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -bindgen = { version = "0.61" } -libbpf-cargo = "0.21" -glob = "0.3" +scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs index ff3bb0b76e794..bbaa1ea71c0cb 100644 --- a/tools/sched_ext/scx_layered/build.rs +++ b/tools/sched_ext/scx_layered/build.rs @@ -2,69 +2,8 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -extern crate bindgen; - -use std::env; -use std::path::PathBuf; - -use glob::glob; -use libbpf_cargo::SkeletonBuilder; - -const HEADER_PATH: &str = "src/bpf/intf.h"; -const SKEL_NAME: &str = "bpf"; - -fn bindgen_bpf_intf() { - // Tell cargo to invalidate the built crate whenever the wrapper changes - println!("cargo:rerun-if-changed={}", HEADER_PATH); - - // The bindgen::Builder is the main entry point - // to bindgen, and lets you build up options for - // the resulting bindings. - let bindings = bindgen::Builder::default() - // Should run clang with the same -I options as BPF compilation. - .clang_args(env::var("BPF_CFLAGS").unwrap().split_whitespace()) - // The input header we would like to generate - // bindings for. - .header(HEADER_PATH) - // Tell cargo to invalidate the built crate whenever any of the - // included header files changed. - .parse_callbacks(Box::new(bindgen::CargoCallbacks)) - // Finish the builder and generate the bindings. - .generate() - // Unwrap the Result and panic on failure. - .expect("Unable to generate bindings"); - - // Write the bindings to the $OUT_DIR/bindings.rs file. 
- let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bpf_intf.rs")) - .expect("Couldn't write bindings!"); -} - -fn gen_bpf_skel() { - let bpf_cflags = env::var("BPF_CFLAGS").unwrap(); - let bpf_clang = env::var("BPF_CLANG").unwrap(); - - let src = format!("./src/bpf/main.bpf.c"); - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - let obj = out_path.join(format!("{}.bpf.o", SKEL_NAME)); - let skel_path = out_path.join(format!("{}_skel.rs", SKEL_NAME)); - - SkeletonBuilder::new() - .source(&src) - .obj(&obj) - .clang(bpf_clang) - .clang_args(bpf_cflags) - .build_and_generate(&skel_path) - .unwrap(); - - // Trigger rebuild if any .[hc] files are changed in the directory. - for path in glob("src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { - println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); - } -} fn main() { - bindgen_bpf_intf(); - gen_bpf_skel(); + scx_utils::build_helpers::bindgen_bpf_intf(None, None); + scx_utils::build_helpers::gen_bpf_skel(None, None, None); } diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml index 68db432c5d24a..a25eb1099f483 100644 --- a/tools/sched_ext/scx_rusty/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -7,7 +7,6 @@ description = "Userspace scheduling with BPF" license = "GPL-2.0-only" [dependencies] -scx_utils = "0.1" anyhow = "1.0.65" bitvec = { version = "1.0", features = ["serde"] } clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } @@ -15,16 +14,14 @@ ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7.0" hex = "0.4.3" libbpf-rs = "0.21.0" -libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" +scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } simplelog = "0.12.0" [build-dependencies] -bindgen = { version = "0.61.0" } -libbpf-cargo = "0.21.0" -glob = "0.3" +scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index ff3bb0b76e794..bbaa1ea71c0cb 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -2,69 +2,8 @@ // This software may be used and distributed according to the terms of the // GNU General Public License version 2. -extern crate bindgen; - -use std::env; -use std::path::PathBuf; - -use glob::glob; -use libbpf_cargo::SkeletonBuilder; - -const HEADER_PATH: &str = "src/bpf/intf.h"; -const SKEL_NAME: &str = "bpf"; - -fn bindgen_bpf_intf() { - // Tell cargo to invalidate the built crate whenever the wrapper changes - println!("cargo:rerun-if-changed={}", HEADER_PATH); - - // The bindgen::Builder is the main entry point - // to bindgen, and lets you build up options for - // the resulting bindings. - let bindings = bindgen::Builder::default() - // Should run clang with the same -I options as BPF compilation. - .clang_args(env::var("BPF_CFLAGS").unwrap().split_whitespace()) - // The input header we would like to generate - // bindings for. - .header(HEADER_PATH) - // Tell cargo to invalidate the built crate whenever any of the - // included header files changed. - .parse_callbacks(Box::new(bindgen::CargoCallbacks)) - // Finish the builder and generate the bindings. - .generate() - // Unwrap the Result and panic on failure. 
- .expect("Unable to generate bindings"); - - // Write the bindings to the $OUT_DIR/bindings.rs file. - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bpf_intf.rs")) - .expect("Couldn't write bindings!"); -} - -fn gen_bpf_skel() { - let bpf_cflags = env::var("BPF_CFLAGS").unwrap(); - let bpf_clang = env::var("BPF_CLANG").unwrap(); - - let src = format!("./src/bpf/main.bpf.c"); - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - let obj = out_path.join(format!("{}.bpf.o", SKEL_NAME)); - let skel_path = out_path.join(format!("{}_skel.rs", SKEL_NAME)); - - SkeletonBuilder::new() - .source(&src) - .obj(&obj) - .clang(bpf_clang) - .clang_args(bpf_cflags) - .build_and_generate(&skel_path) - .unwrap(); - - // Trigger rebuild if any .[hc] files are changed in the directory. - for path in glob("src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { - println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); - } -} fn main() { - bindgen_bpf_intf(); - gen_bpf_skel(); + scx_utils::build_helpers::bindgen_bpf_intf(None, None); + scx_utils::build_helpers::gen_bpf_skel(None, None, None); } From df7ea88b9774882c45aec43233523d14ef465657 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 Nov 2023 21:13:34 -1000 Subject: [PATCH 208/304] scx_{rusty|layered}: Follow scx_utils::BpfBuilder API updates --- tools/sched_ext/scx_layered/build.rs | 8 ++++++-- tools/sched_ext/scx_rusty/build.rs | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs index bbaa1ea71c0cb..1c972ae8d82e2 100644 --- a/tools/sched_ext/scx_layered/build.rs +++ b/tools/sched_ext/scx_layered/build.rs @@ -4,6 +4,10 @@ // GNU General Public License version 2. fn main() { - scx_utils::build_helpers::bindgen_bpf_intf(None, None); - scx_utils::build_helpers::gen_bpf_skel(None, None, None); + scx_utils::BpfBuilder::new() + .unwrap() + .enable_intf("src/bpf/intf.h", "bpf_intf.rs") + .enable_skel("src/bpf/main.bpf.c", "bpf") + .build() + .unwrap(); } diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index bbaa1ea71c0cb..1c972ae8d82e2 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -4,6 +4,10 @@ // GNU General Public License version 2. 
fn main() { - scx_utils::build_helpers::bindgen_bpf_intf(None, None); - scx_utils::build_helpers::gen_bpf_skel(None, None, None); + scx_utils::BpfBuilder::new() + .unwrap() + .enable_intf("src/bpf/intf.h", "bpf_intf.rs") + .enable_skel("src/bpf/main.bpf.c", "bpf") + .build() + .unwrap(); } From 5f200bb4bc14af1857ba39a71fb387709592c6f1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 3 Dec 2023 12:16:08 -1000 Subject: [PATCH 209/304] scx_{layered, rusty}: Minor build updates --- tools/sched_ext/scx_layered/Cargo.toml | 4 ++-- tools/sched_ext/scx_layered/build.rs | 2 +- tools/sched_ext/scx_rusty/Cargo.toml | 4 ++-- tools/sched_ext/scx_rusty/build.rs | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml index 53726a2233082..1de594354ed0c 100644 --- a/tools/sched_ext/scx_layered/Cargo.toml +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -16,13 +16,13 @@ lazy_static = "1.4" libbpf-rs = "0.21" libc = "0.2" log = "0.4" -scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } +scx_utils = "0.2" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } +scx_utils = "0.2" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs index 1c972ae8d82e2..d26db839cd9e1 100644 --- a/tools/sched_ext/scx_layered/build.rs +++ b/tools/sched_ext/scx_layered/build.rs @@ -1,5 +1,5 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. - +// // This software may be used and distributed according to the terms of the // GNU General Public License version 2. diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml index a25eb1099f483..23d37b70e4eb1 100644 --- a/tools/sched_ext/scx_rusty/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -17,11 +17,11 @@ libbpf-rs = "0.21.0" libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" -scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } +scx_utils = "0.2" simplelog = "0.12.0" [build-dependencies] -scx_utils = { path = "/home/htejun/os/scx/rust/scx_utils", version = "0.1" } +scx_utils = "0.2" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs index 1c972ae8d82e2..d26db839cd9e1 100644 --- a/tools/sched_ext/scx_rusty/build.rs +++ b/tools/sched_ext/scx_rusty/build.rs @@ -1,5 +1,5 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. - +// // This software may be used and distributed according to the terms of the // GNU General Public License version 2. 
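Taken together, each scheduler's build.rs is now just the BpfBuilder call
chain from the diffs above. The comments below spell out what each step
produces, assuming the generated file names still match the include! wrappers
shown earlier (bpf_intf.rs and bpf_skel.rs under OUT_DIR):

  fn main() {
      scx_utils::BpfBuilder::new()
          .unwrap()
          // Run bindgen on src/bpf/intf.h to produce $OUT_DIR/bpf_intf.rs,
          // which is pulled into the crate by src/bpf_intf.rs.
          .enable_intf("src/bpf/intf.h", "bpf_intf.rs")
          // Compile src/bpf/main.bpf.c and generate the libbpf-rs skeleton
          // under the "bpf" name, consumed via src/bpf_skel.rs.
          .enable_skel("src/bpf/main.bpf.c", "bpf")
          .build()
          .unwrap();
  }
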
From 47c9356e203f713389e746c89fa3850ac82d529f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 3 Dec 2023 12:32:44 -1000 Subject: [PATCH 210/304] scx: Move common headers under include/scx --- tools/sched_ext/Makefile | 8 ++++---- .../{scx_common.bpf.h => include/scx/common.bpf.h} | 0 tools/sched_ext/{scx_common.h => include/scx/common.h} | 0 tools/sched_ext/{ => include/scx}/ravg.bpf.h | 0 tools/sched_ext/{ => include/scx}/ravg_impl.bpf.h | 0 tools/sched_ext/{ => include/scx}/user_exit_info.h | 0 tools/sched_ext/scx_central.bpf.c | 2 +- tools/sched_ext/scx_central.c | 2 +- tools/sched_ext/scx_flatcg.bpf.c | 2 +- tools/sched_ext/scx_flatcg.c | 2 +- tools/sched_ext/scx_layered/src/bpf/intf.h | 2 +- tools/sched_ext/scx_layered/src/bpf/main.bpf.c | 4 ++-- tools/sched_ext/scx_pair.bpf.c | 2 +- tools/sched_ext/scx_pair.c | 2 +- tools/sched_ext/scx_qmap.bpf.c | 2 +- tools/sched_ext/scx_qmap.c | 2 +- tools/sched_ext/scx_rusty/src/bpf/intf.h | 2 +- tools/sched_ext/scx_rusty/src/bpf/main.bpf.c | 4 ++-- tools/sched_ext/scx_simple.bpf.c | 2 +- tools/sched_ext/scx_simple.c | 2 +- tools/sched_ext/scx_userland.bpf.c | 2 +- tools/sched_ext/scx_userland.c | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) rename tools/sched_ext/{scx_common.bpf.h => include/scx/common.bpf.h} (100%) rename tools/sched_ext/{scx_common.h => include/scx/common.h} (100%) rename tools/sched_ext/{ => include/scx}/ravg.bpf.h (100%) rename tools/sched_ext/{ => include/scx}/ravg_impl.bpf.h (100%) rename tools/sched_ext/{ => include/scx}/user_exit_info.h (100%) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 43926befe86a4..4cfdf4ece2b13 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -91,7 +91,7 @@ endif CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ - -I$(TOOLSINCDIR) -I$(APIDIR) + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include CARGOFLAGS := --release --target-dir $(OUTPUT_DIR) ifneq ($(CARGO_OFFLINE),) @@ -122,7 +122,7 @@ endef BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ - -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(INCLUDE_DIR) -I$(CURDIR)/include -I$(APIDIR) \ -I../../include \ $(call get_sys_includes,$(CLANG)) \ -Wall -Wno-compare-distinct-pointer-types \ @@ -163,7 +163,7 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif -$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h \ +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/common.bpf.h \ user_exit_info.h ravg.bpf.h ravg_impl.bpf.h \ | $(BPFOBJ) $(SCXOBJ_DIR) $(call msg,CLNG-BPF,,$(notdir $@)) @@ -179,7 +179,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -SCX_COMMON_DEPS := scx_common.h user_exit_info.h | $(BINDIR) +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) ################ # C schedulers # diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h similarity index 100% rename from tools/sched_ext/scx_common.bpf.h rename to tools/sched_ext/include/scx/common.bpf.h diff --git a/tools/sched_ext/scx_common.h b/tools/sched_ext/include/scx/common.h similarity index 100% rename from tools/sched_ext/scx_common.h rename to tools/sched_ext/include/scx/common.h diff --git 
a/tools/sched_ext/ravg.bpf.h b/tools/sched_ext/include/scx/ravg.bpf.h similarity index 100% rename from tools/sched_ext/ravg.bpf.h rename to tools/sched_ext/include/scx/ravg.bpf.h diff --git a/tools/sched_ext/ravg_impl.bpf.h b/tools/sched_ext/include/scx/ravg_impl.bpf.h similarity index 100% rename from tools/sched_ext/ravg_impl.bpf.h rename to tools/sched_ext/include/scx/ravg_impl.bpf.h diff --git a/tools/sched_ext/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h similarity index 100% rename from tools/sched_ext/user_exit_info.h rename to tools/sched_ext/include/scx/user_exit_info.h diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index de05779619878..5faf0d22d32b6 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -45,7 +45,7 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#include "scx_common.bpf.h" +#include char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 914993d31fa83..1092443230743 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -11,7 +11,7 @@ #include #include #include -#include "scx_common.h" +#include #include "scx_central.bpf.skel.h" const char help_fmt[] = diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 2db3d8d45e683..79d625b385045 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -43,7 +43,7 @@ * within by using nested weighted vtime scheduling by default. The * cgroup-internal scheduling can be switched to FIFO with the -f option. */ -#include "scx_common.bpf.h" +#include #include "user_exit_info.h" #include "scx_flatcg.h" diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 886891540cbd8..6a6e47c83ede7 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -12,7 +12,7 @@ #include #include #include -#include "scx_common.h" +#include #include "scx_flatcg.h" #include "scx_flatcg.bpf.skel.h" diff --git a/tools/sched_ext/scx_layered/src/bpf/intf.h b/tools/sched_ext/scx_layered/src/bpf/intf.h index 8513779d5f547..000f48b4d7502 100644 --- a/tools/sched_ext/scx_layered/src/bpf/intf.h +++ b/tools/sched_ext/scx_layered/src/bpf/intf.h @@ -18,7 +18,7 @@ typedef unsigned long long u64; typedef long long s64; #endif -#include "ravg.bpf.h" +#include enum consts { MAX_CPUS_SHIFT = 9, diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c index d4714f89ee69f..cd74769e952b9 100644 --- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c @@ -1,5 +1,6 @@ /* Copyright (c) Meta Platforms, Inc. and affiliates. */ -#include "scx_common.bpf.h" +#include +#include #include "intf.h" #include @@ -27,7 +28,6 @@ static u32 preempt_cursor; #define trace(fmt, args...) 
do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) #include "util.bpf.c" -#include "ravg_impl.bpf.h" struct user_exit_info uei; diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c index 9c9cf97f4feeb..9da53c4b3e634 100644 --- a/tools/sched_ext/scx_pair.bpf.c +++ b/tools/sched_ext/scx_pair.bpf.c @@ -115,7 +115,7 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#include "scx_common.bpf.h" +#include #include "scx_pair.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index 7c377f180724a..693f095b8c660 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -9,7 +9,7 @@ #include #include #include -#include "scx_common.h" +#include #include "scx_pair.h" #include "scx_pair.bpf.skel.h" diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 090548ddd2898..831df3f644d5a 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -22,7 +22,7 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#include "scx_common.bpf.h" +#include char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 0b85067703fb8..d817115c0b0a8 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -10,7 +10,7 @@ #include #include #include -#include "scx_common.h" +#include #include "scx_qmap.bpf.skel.h" const char help_fmt[] = diff --git a/tools/sched_ext/scx_rusty/src/bpf/intf.h b/tools/sched_ext/scx_rusty/src/bpf/intf.h index 54d28696ac5a7..f295695102051 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/intf.h +++ b/tools/sched_ext/scx_rusty/src/bpf/intf.h @@ -19,7 +19,7 @@ typedef unsigned int u32; typedef unsigned long long u64; #endif -#include "ravg.bpf.h" +#include enum consts { MAX_CPUS = 512, diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c index c82ad8973d96a..befaba957105e 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c @@ -35,8 +35,8 @@ * task weight, dom mask and current dom in the task_data map and executes the * load balance based on userspace populating the lb_data map. 
*/ -#include "scx_common.bpf.h" -#include "ravg_impl.bpf.h" +#include +#include #include "intf.h" #include diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index 56b589d7f6630..eeb7414883a67 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -20,7 +20,7 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#include "scx_common.bpf.h" +#include char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index b09b76be0a267..5c5589770a2fc 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -9,7 +9,7 @@ #include #include #include -#include "scx_common.h" +#include #include "scx_simple.bpf.skel.h" const char help_fmt[] = diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c index 9e107a874a92d..f2791a6aecc8b 100644 --- a/tools/sched_ext/scx_userland.bpf.c +++ b/tools/sched_ext/scx_userland.bpf.c @@ -21,7 +21,7 @@ * Copyright (c) 2022 David Vernet */ #include -#include "scx_common.bpf.h" +#include #include "scx_userland.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index 7b5322b3f1c85..fef028a1756e0 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -27,7 +27,7 @@ #include #include -#include "scx_common.h" +#include #include "scx_userland.h" #include "scx_userland.bpf.skel.h" From d6bd20a939a930a650a3041e155088ddf5636b40 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 3 Dec 2023 14:32:53 -1000 Subject: [PATCH 211/304] scx: More include path and build updates --- tools/sched_ext/Makefile | 6 +++--- tools/sched_ext/gnu/stubs.h | 1 - tools/sched_ext/include/bpf-compat/gnu/stubs.h | 11 +++++++++++ tools/sched_ext/scx_flatcg.bpf.c | 1 - tools/sched_ext/scx_layered/Cargo.toml | 4 ++-- tools/sched_ext/scx_rusty/Cargo.toml | 4 ++-- 6 files changed, 18 insertions(+), 9 deletions(-) delete mode 100644 tools/sched_ext/gnu/stubs.h create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 4cfdf4ece2b13..b9e42771a4c50 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -122,7 +122,8 @@ endef BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ - -I$(INCLUDE_DIR) -I$(CURDIR)/include -I$(APIDIR) \ + -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \ + -I$(INCLUDE_DIR) -I$(APIDIR) \ -I../../include \ $(call get_sys_includes,$(CLANG)) \ -Wall -Wno-compare-distinct-pointer-types \ @@ -163,8 +164,7 @@ else $(Q)cp "$(VMLINUX_H)" $@ endif -$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/common.bpf.h \ - user_exit_info.h ravg.bpf.h ravg_impl.bpf.h \ +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ | $(BPFOBJ) $(SCXOBJ_DIR) $(call msg,CLNG-BPF,,$(notdir $@)) $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ diff --git a/tools/sched_ext/gnu/stubs.h b/tools/sched_ext/gnu/stubs.h deleted file mode 100644 index 719225b166269..0000000000000 --- a/tools/sched_ext/gnu/stubs.h +++ /dev/null @@ -1 +0,0 @@ -/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h new file mode 100644 index 0000000000000..ad7d139ce907b --- /dev/null +++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h @@ -0,0 +1,11 @@ 
+/* + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when + * compiling BPF files although its content doesn't play any role. The file in + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is + * defined. When compiling a BPF source, __x86_64__ isn't set and thus + * stubs-32.h is selected. However, the file is not there if the system doesn't + * have 32bit glibc devel package installed leading to a build failure. + * + * The problem is worked around by making this file available in the include + * search paths before the system one when building BPF. + */ diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 79d625b385045..84a60d7e4024b 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -44,7 +44,6 @@ * cgroup-internal scheduling can be switched to FIFO with the -f option. */ #include -#include "user_exit_info.h" #include "scx_flatcg.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml index 1de594354ed0c..19dd0243a9f2a 100644 --- a/tools/sched_ext/scx_layered/Cargo.toml +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -16,13 +16,13 @@ lazy_static = "1.4" libbpf-rs = "0.21" libc = "0.2" log = "0.4" -scx_utils = "0.2" +scx_utils = "0.3" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -scx_utils = "0.2" +scx_utils = "0.3" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml index 23d37b70e4eb1..309643687d0c6 100644 --- a/tools/sched_ext/scx_rusty/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -17,11 +17,11 @@ libbpf-rs = "0.21.0" libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" -scx_utils = "0.2" +scx_utils = "0.3" simplelog = "0.12.0" [build-dependencies] -scx_utils = "0.2" +scx_utils = "0.3" [features] enable_backtrace = [] From 234eb2c19d60ad472b33fb35f81ffb6f7cf92de7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2023 11:31:00 -1000 Subject: [PATCH 212/304] scx_sync: Sync scheduler changes from https://github.com/sched-ext/scx --- .../sched_ext/scx_layered/src/bpf/main.bpf.c | 20 +++++++++++++++---- tools/sched_ext/scx_rusty/src/bpf/main.bpf.c | 18 +++++++++++++++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c index cd74769e952b9..98d9418e1adf1 100644 --- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c @@ -250,10 +250,20 @@ struct layer *lookup_layer(int idx) return &layers[idx]; } +/* + * Because the layer membership is by the default hierarchy cgroups rather than + * the CPU controller membership, we can't use ops.cgroup_move(). Let's iterate + * the tasks manually and set refresh_layer. + * + * The iteration isn't synchronized and may fail spuriously. It's not a big + * practical problem as process migrations are very rare in most modern systems. + * That said, we eventually want this to be based on CPU controller membership. 
+ */ SEC("tp_btf/cgroup_attach_task") int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, struct task_struct *leader, bool threadgroup) { + struct list_head *thread_head; struct task_struct *next; struct task_ctx *tctx; int leader_pid = leader->pid; @@ -265,6 +275,8 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, if (!threadgroup) return 0; + thread_head = &leader->signal->thread_head; + if (!(next = bpf_task_acquire(leader))) { scx_bpf_error("failed to acquire leader"); return 0; @@ -274,18 +286,18 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, struct task_struct *p; int pid; - p = container_of(next->thread_group.next, struct task_struct, thread_group); + p = container_of(next->thread_node.next, struct task_struct, thread_node); bpf_task_release(next); - pid = BPF_CORE_READ(p, pid); - if (pid == leader_pid) { + if (&p->thread_node == thread_head) { next = NULL; break; } + pid = BPF_CORE_READ(p, pid); next = bpf_task_from_pid(pid); if (!next) { - scx_bpf_error("thread iteration failed"); + bpf_printk("scx_layered: tp_cgroup_attach_task: thread iteration failed"); break; } diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c index befaba957105e..c85e95bf372a4 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c @@ -966,7 +966,13 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, pid_t pid; pid = p->pid; - ret = bpf_map_update_elem(&task_data, &pid, &taskc, BPF_NOEXIST); + + /* + * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may + * fail spuriously due to BPF recursion protection triggering + * unnecessarily. + */ + ret = bpf_map_update_elem(&task_data, &pid, &taskc, 0 /*BPF_NOEXIST*/); if (ret) { stat_add(RUSTY_STAT_TASK_GET_ERR, 1); return ret; @@ -1003,7 +1009,15 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p) { pid_t pid = p->pid; - long ret = bpf_map_delete_elem(&task_data, &pid); + long ret; + + /* + * XXX - There's no reason delete should fail here but BPF's recursion + * protection can unnecessarily fail the operation. The fact that + * deletions aren't reliable means that we sometimes leak task_ctx and + * can't use BPF_NOEXIST on allocation in .prep_enable(). + */ + ret = bpf_map_delete_elem(&task_data, &pid); if (ret) { stat_add(RUSTY_STAT_TASK_GET_ERR, 1); return; From 25a5d102d28a2cc389ec5a9cb50873f7844f2edf Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 4 Dec 2023 15:35:36 -0600 Subject: [PATCH 213/304] scx: Disable vtime ordering for internal DSQs Internal DSQs, i.e. SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL, have somewhat special behavior in that they're automatically consumed by the internal ext.c logic. A user could therefore accidentally starve tasks on either of the DSQs if they dispatch to both the vtime and FIFO queues, as they're consumed in a specific order by the internal logic. It likely doesn't make sense to ever use both FIFO and PRIQ ordering in the same DSQ, so let's explicitly disable it for the internal DSQs. In a follow-on change, we'll error out a scheduler if a user dispatches to both FIFO and vtime for any DSQ. 
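For illustration only (not part of this patch), a minimal sketch of the intended split: vtime ordering stays available on user-created DSQs while built-in DSQs only take plain FIFO dispatch. MY_DSQ is a hypothetical user-created DSQ id, assumed to have been created with scx_bpf_create_dsq() in ops.init() as scx_simple does later in this series:

	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/* per-CPU kthreads: plain FIFO dispatch to a built-in DSQ */
		if (p->nr_cpus_allowed == 1) {
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
			return;
		}

		/* everything else: vtime ordering on a user-created DSQ */
		scx_bpf_dispatch_vtime(p, MY_DSQ, SCX_SLICE_DFL,
				       p->scx.dsq_vtime, enq_flags);
	}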
Reported-by: Changwoo Min Signed-off-by: David Vernet --- Documentation/scheduler/sched-ext.rst | 8 +++++--- kernel/sched/ext.c | 23 ++++++++++++++--------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 25ddb535c2972..b67346cf52354 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -194,9 +194,11 @@ a task is never queued on the BPF scheduler and both the local and global DSQs are consumed automatically. ``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use -``scx_bpf_dispatch_vtime()`` for the priority queue. See the function -documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for more -information. +``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as +``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue +dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the +function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for +more information. Where to Look ============= diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ebe295eb78de4..c4fea4f7ff059 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -647,6 +647,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + WARN_ON_ONCE(dsq->id & SCX_DSQ_FLAG_BUILTIN); p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); @@ -1772,16 +1773,9 @@ static struct task_struct *first_local_task(struct rq *rq) { struct rb_node *rb_node; - if (!list_empty(&rq->scx.local_dsq.fifo)) - return list_first_entry(&rq->scx.local_dsq.fifo, + WARN_ON_ONCE(rb_first_cached(&rq->scx.local_dsq.priq)); + return list_first_entry_or_null(&rq->scx.local_dsq.fifo, struct task_struct, scx.dsq_node.fifo); - - rb_node = rb_first_cached(&rq->scx.local_dsq.priq); - if (rb_node) - return container_of(rb_node, - struct task_struct, scx.dsq_node.priq); - - return NULL; } static struct task_struct *pick_next_task_scx(struct rq *rq) @@ -3948,6 +3942,17 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, if (!scx_dispatch_preamble(p, enq_flags)) return; + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from their FIFO + * queues. To avoid confusion and accidentally starving + * vtime-dispatched tasks by FIFO-dispatched tasks, we disallow any + * internal DSQ from doing vtime ordering of tasks. + */ + if (dsq_id & SCX_DSQ_FLAG_BUILTIN) { + scx_ops_error("Cannot use vtime ordering for built-in DSQs"); + return; + } + if (slice) p->scx.slice = slice; else From 346fd9d1176f52ecee6739787c6c7c1869c1263d Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 4 Dec 2023 15:53:19 -0600 Subject: [PATCH 214/304] scx: Enforce either/or usage of DSQ FIFO/PRIQ dispatching Currently, a user can do both FIFO and PRIQ dispatching to a DSQ. This can result in non-intuitive behavior. For example, if a user PRIQ-dispatches to a DSQ, and then subsequently FIFO dispatches, an scx_bpf_consume() operation will always favor the FIFO-dispatched task. While we could add something like an scx_bpf_consume_vtime() kfunc, given that there's not a clear use-case for doing both types of dispatching in a single DSQ, for now we'll elect to just enforce that only a single type is being used at any given time. 
Reported-by: Changwoo Min Signed-off-by: David Vernet --- kernel/sched/ext.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c4fea4f7ff059..1095d494cdf24 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -651,11 +651,19 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(!list_empty(&dsq->fifo))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); } else { if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_node.fifo, &dsq->fifo); else list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(rb_first_cached(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); } dsq->nr++; p->scx.dsq = dsq; From 03b9a1fcd4d10d93276b49d9d5aa5237ab875323 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2023 15:40:29 -1000 Subject: [PATCH 215/304] scx_sync: Sync scheduler changes from https://github.com/sched-ext/scx --- tools/sched_ext/scx_simple.bpf.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index eeb7414883a67..7485acbc4f509 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -30,6 +30,8 @@ const volatile bool switch_partial; static u64 vtime_now; struct user_exit_info uei; +#define SHARED_DSQ 0 + struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(key_size, sizeof(u32)); @@ -65,7 +67,7 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) stat_inc(1); /* count global queueing */ if (fifo_sched) { - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); } else { u64 vtime = p->scx.dsq_vtime; @@ -76,11 +78,16 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) vtime = vtime_now - SCX_SLICE_DFL; - scx_bpf_dispatch_vtime(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, vtime, + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, enq_flags); } } +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SHARED_DSQ); +} + void BPF_STRUCT_OPS(simple_running, struct task_struct *p) { if (fifo_sched) @@ -119,11 +126,12 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, p->scx.dsq_vtime = vtime_now; } -s32 BPF_STRUCT_OPS(simple_init) +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) { if (!switch_partial) scx_bpf_switch_all(); - return 0; + + return scx_bpf_create_dsq(SHARED_DSQ, -1); } void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) @@ -134,6 +142,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops simple_ops = { .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, .running = (void *)simple_running, .stopping = (void *)simple_stopping, .enable = (void *)simple_enable, From 782f273a94f338c791393302ba6b5dddf128c0a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Dec 2023 14:12:51 -1000 Subject: [PATCH 216/304] scx: Remove now unused $rb_node from first_local_task() --- kernel/sched/ext.c | 2 -- 1 file changed, 2 deletions(-) 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1095d494cdf24..53ee906aa2b68 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1779,8 +1779,6 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { - struct rb_node *rb_node; - WARN_ON_ONCE(rb_first_cached(&rq->scx.local_dsq.priq)); return list_first_entry_or_null(&rq->scx.local_dsq.fifo, struct task_struct, scx.dsq_node.fifo); From 9c18e3dfb07099033d5fc0155de2b1793f9ce979 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 7 Dec 2023 19:22:37 -0600 Subject: [PATCH 217/304] scx_sync: Sync scheduler changes from https://github.com/sched-ext/scx Signed-off-by: David Vernet --- tools/sched_ext/scx_central.bpf.c | 21 +++++++++++++++++++-- tools/sched_ext/scx_central.c | 3 +++ tools/sched_ext/scx_rusty/src/main.rs | 22 +++++++++++----------- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 5faf0d22d32b6..4f398249fb2cc 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -60,6 +60,7 @@ const volatile s32 central_cpu; const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ const volatile u64 slice_ns = SCX_SLICE_DFL; +bool timer_pinned = true; u64 nr_total, nr_locals, nr_queued, nr_lost_pids; u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; u64 nr_overflows; @@ -255,7 +256,7 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) s32 i, curr_cpu; curr_cpu = bpf_get_smp_processor_id(); - if (curr_cpu != central_cpu) { + if (timer_pinned && (curr_cpu != central_cpu)) { scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", curr_cpu, central_cpu); return 0; @@ -308,12 +309,28 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init) if (!timer) return -ESRCH; - if (bpf_get_smp_processor_id() != central_cpu) + if (bpf_get_smp_processor_id() != central_cpu) { + scx_bpf_error("init from non-central CPU"); return -EINVAL; + } bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); bpf_timer_set_callback(timer, central_timerfn); + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + /* + * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a + * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. + * Retry without the PIN. This would be the perfect use case for + * bpf_core_enum_value_exists() but the enum type doesn't have a name + * and can't be used with bpf_core_enum_value_exists(). Oh well... 
+ */ + if (ret == -EINVAL) { + timer_pinned = false; + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + } + if (ret) + scx_bpf_error("bpf_timer_start failed (%d)", ret); return ret; } diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 1092443230743..a3d22409e9ce5 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -98,6 +98,9 @@ int main(int argc, char **argv) link = bpf_map__attach_struct_ops(skel->maps.central_ops); SCX_BUG_ON(!link, "Failed to attach struct_ops"); + if (!skel->data->timer_pinned) + printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); + while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index 3d802e27d9ea2..ff7cc9d80a7ea 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -190,17 +190,6 @@ fn read_total_cpu(reader: &procfs::ProcReader) -> Result { fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { match (curr, prev) { ( - procfs::CpuStat { - user_usec: Some(prev_user), - nice_usec: Some(prev_nice), - system_usec: Some(prev_system), - idle_usec: Some(prev_idle), - iowait_usec: Some(prev_iowait), - irq_usec: Some(prev_irq), - softirq_usec: Some(prev_softirq), - stolen_usec: Some(prev_stolen), - .. - }, procfs::CpuStat { user_usec: Some(curr_user), nice_usec: Some(curr_nice), @@ -212,6 +201,17 @@ fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { stolen_usec: Some(curr_stolen), .. }, + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + .. + }, ) => { let idle_usec = curr_idle - prev_idle; let iowait_usec = curr_iowait - prev_iowait; From 36d38385c5090fd07b9b8e8bb2a0f2c533cb20cb Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 7 Dec 2023 22:32:24 -0600 Subject: [PATCH 218/304] scx: Add missing ) to $(error) invocation in Makefile We're missing a closing ) on a branch that we never take. Let's close it just for correctness. Signed-off-by: David Vernet --- tools/sched_ext/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index b9e42771a4c50..7db68d2053765 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -26,7 +26,7 @@ CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) ifeq ($(CROSS_COMPILE),) ifeq ($(CLANG_TARGET_FLAGS),) -$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) else CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) endif # CLANG_TARGET_FLAGS From d3f9558717ff9b768ba0b7d156885d45e96edf5c Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 7 Dec 2023 19:37:15 -0600 Subject: [PATCH 219/304] scx: Add skeleton for scx testing framework We should build a selftest suite to do some basic sanity testing of scx. Some elements are going to be borrowed from tools/testing/selftests/bpf, as we're going to be building and loading BPF progs, and sometimes verifying that BPF progs fail to load. 
Signed-off-by: David Vernet --- tools/testing/selftests/scx/.gitignore | 2 + tools/testing/selftests/scx/Makefile | 171 ++++++++++++++++++++++ tools/testing/selftests/scx/config | 5 + tools/testing/selftests/scx/minimal.bpf.c | 32 ++++ tools/testing/selftests/scx/minimal.c | 42 ++++++ 5 files changed, 252 insertions(+) create mode 100644 tools/testing/selftests/scx/.gitignore create mode 100644 tools/testing/selftests/scx/Makefile create mode 100644 tools/testing/selftests/scx/config create mode 100644 tools/testing/selftests/scx/minimal.bpf.c create mode 100644 tools/testing/selftests/scx/minimal.c diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore new file mode 100644 index 0000000000000..72fc34154e98b --- /dev/null +++ b/tools/testing/selftests/scx/.gitignore @@ -0,0 +1,2 @@ +minimal +build/ diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile new file mode 100644 index 0000000000000..c331cfc380b6c --- /dev/null +++ b/tools/testing/selftests/scx/Makefile @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../../../build/Build.include +include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include +include ../lib.mk + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := gcc +endif # LLVM + +ifneq ($(CROSS_COMPILE),) +$(error CROSS_COMPILE not supported for scx selftests) +endif # CROSS_COMPILE + +CURDIR := $(abspath .) +REPOROOT := $(abspath ../../../..) +TOOLSDIR := $(REPOROOT)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(REPOROOT)/include/generated +GENHDR := $(GENDIR)/autoconf.h +SCXTOOLSDIR := $(TOOLSDIR)/sched_ext +SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include + +OUTPUT_DIR := $(CURDIR)/build +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a +DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: 
$(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +################ +# C schedulers # +################ +c-sched-targets := minimal + +$(c-sched-targets): %: $(filter-out %.bpf.c,%.c) $(INCLUDE_DIR)/%.bpf.skel.h + $(eval sched=$(notdir $@)) + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(LIBBPF_OUTPUT) $(LDFLAGS) + +TEST_GEN_PROGS := $(c-sched-targets) + +override define CLEAN + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(TEST_GEN_PROGS) +endef + +all: $(TEST_GEN_PROGS) + +.PHONY: all clean help + +.DEFAULT_GOAL := all + +.DELETE_ON_ERROR: + +.SECONDARY: diff --git a/tools/testing/selftests/scx/config b/tools/testing/selftests/scx/config new file mode 100644 index 0000000000000..fef8f81ad3776 --- /dev/null +++ b/tools/testing/selftests/scx/config @@ -0,0 +1,5 @@ +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_EXT_GROUP_SCHED=y diff --git a/tools/testing/selftests/scx/minimal.bpf.c b/tools/testing/selftests/scx/minimal.bpf.c new file mode 100644 index 0000000000000..14b3d44d90db5 --- /dev/null +++ b/tools/testing/selftests/scx/minimal.bpf.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A completely minimal scheduler. + * + * This scheduler defines the absolute minimal set of struct sched_ext_ops + * fields: its name (and until a bug is fixed in libbpf, also an ops.running() + * callback). It should _not_ fail to be loaded, and can be used to exercise + * the default scheduling paths in ext.c. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +void BPF_STRUCT_OPS(minimal_running, struct task_struct *p) +{} + +SEC(".struct_ops.link") +struct sched_ext_ops minimal_ops = { + /* + * It shouldn't be necessary to define this minimal_running op, but + * libbpf currently expects that a struct_ops map will always have at + * least one struct_ops prog when loading. Until that issue is fixed, + * let's also define a minimal prog so that we can load and test. + */ + .enable = minimal_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/minimal.c b/tools/testing/selftests/scx/minimal.c new file mode 100644 index 0000000000000..722f0d5023994 --- /dev/null +++ b/tools/testing/selftests/scx/minimal.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include "minimal.bpf.skel.h" + +static volatile int exit_req; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct minimal *skel; + struct bpf_link *link; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = minimal__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + sleep(1); + bpf_link__destroy(link); + minimal__destroy(skel); + + return 0; +} From dad3fb67ca1cbef87ce700e83a55835e5921ce8a Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 27 Dec 2023 15:18:05 +0100 Subject: [PATCH 220/304] kernfs: convert kernfs_idr_lock to an irq safe raw spinlock bpf_cgroup_from_id() (provided by sched-ext) needs to acquire kernfs_idr_lock and it can be used in the scheduler dispatch path with rq->_lock held. But any kernfs function that is acquiring kernfs_idr_lock can be interrupted by a scheduler tick, that would try to acquire rq->_lock, triggering the following deadlock scenario: CPU0 CPU1 ---- ---- lock(kernfs_idr_lock); lock(rq->__lock); lock(kernfs_idr_lock); lock(rq->__lock); More in general, considering that bpf_cgroup_from_id() is provided as a kfunc, potentially similar deadlock conditions can be triggered from any kprobe/tracepoint/fentry. For this reason, in order to prevent any potential deadlock scenario, convert kernfs_idr_lock to a raw irq safe spinlock. Signed-off-by: Andrea Righi --- fs/kernfs/dir.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8b2bd65d70e72..9ce7d2872b554 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -27,7 +27,7 @@ static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ -static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ +static DEFINE_RAW_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -539,6 +539,7 @@ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; + unsigned long flags; if (!kn || !atomic_dec_and_test(&kn->count)) return; @@ -563,9 +564,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, flags); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -607,6 +608,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *kn; u32 id_highbits; int ret; + unsigned long irqflags; name = kstrdup_const(name, GFP_KERNEL); if (!name) @@ -617,13 +619,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, goto err_out1; idr_preload(GFP_KERNEL); - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = 
root->id_highbits; root->last_id_lowbits = ret; - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); idr_preload_end(); if (ret < 0) goto err_out2; @@ -659,9 +661,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -702,8 +704,9 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); + unsigned long flags; - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, flags); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) @@ -727,10 +730,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); return kn; err_unlock: - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); return NULL; } From 6b747e0ee5fca284330d065a0c777d1991290bc4 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 27 Dec 2023 17:25:54 +0100 Subject: [PATCH 221/304] sched_ext: fix race in scx_move_task() with exiting tasks There is a race with exiting tasks in scx_move_tasks() where we may fail to check for autogroup tasks, leading to the following oops: WARNING: CPU: 2 PID: 100 at kernel/sched/ext.c:2571 scx_move_task+0x9f/0xb0 ... Sched_ext: flatcg (enabled+all), task: runnable_at=-5ms RIP: 0010:scx_move_task+0x9f/0xb0 Call Trace: ? scx_move_task+0x9f/0xb0 ? __warn+0x85/0x170 ? scx_move_task+0x9f/0xb0 ? report_bug+0x171/0x1a0 ? handle_bug+0x3b/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? scx_move_task+0x9f/0xb0 sched_move_task+0x104/0x300 do_exit+0x37d/0xb70 ? lock_release+0xbe/0x270 do_group_exit+0x37/0xa0 __x64_sys_exit_group+0x18/0x20 do_syscall_64+0x44/0xf0 entry_SYSCALL_64_after_hwframe+0x6f/0x77 And a related NULL pointer dereference afterwards: BUG: kernel NULL pointer dereference, address: 0000000000000148 Prevent this by skipping scx_move_tasks() actions for exiting tasks. Moreover, make scx_move_tasks() more reliable by triggering only the WARN_ON_ONCE() and returning, instead of triggering also the bug afterwards. Signed-off-by: Andrea Righi --- kernel/sched/ext.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 53ee906aa2b68..634fcb7cb2431 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2560,15 +2560,22 @@ void scx_move_task(struct task_struct *p) /* * We're called from sched_move_task() which handles both cgroup and * autogroup moves. Ignore the latter. + * + * Also ignore exiting tasks, because in the exit path tasks transition + * from the autogroup to the root group, so task_group_is_autogroup() + * alone isn't able to catch exiting autogroup tasks. This is safe for + * cgroup_move(), because cgroup migrations never happen for PF_EXITING + * tasks. 
*/ - if (task_group_is_autogroup(task_group(p))) + if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p))) return; if (!scx_enabled()) return; if (SCX_HAS_OP(cgroup_move)) { - WARN_ON_ONCE(!p->scx.cgrp_moving_from); + if (WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + return; SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); } From 07acdca60031900f7d2ae824951342e0cd98f74e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 21 Dec 2023 17:47:21 -0600 Subject: [PATCH 222/304] scx: Support direct dispatching from ops.select_cpu() A common pattern in schedulers is to find and reserve an idle core in ops.select_cpu(), and to then use a task local storage map to specify that the task should be enqueued in SCX_DSQ_LOCAL on the ops.enqueue() path. At the same time, we also have a special SCX_TASK_ENQ_LOCAL enqueue flag which is used by scx_select_cpu_dfl() to notify ops.enqueue() that it may want to do a local enqueue. Taking a step back, direct dispatch is something that should be supported from the ops.select_cpu() path as well. The contract is that doing a direct dispatch to SCX_DSQ_LOCAL will dispatch the task to the local CPU of whatever is returned by ops.select_cpu(). With that in mind, let's just extend the API a bit to support direct dispatch from ops.select_cpu(). Signed-off-by: David Vernet --- Documentation/scheduler/sched-ext.rst | 12 +++- include/linux/sched/ext.h | 18 ++++- init/init_task.c | 1 + kernel/sched/core.c | 1 + kernel/sched/ext.c | 100 ++++++++++++++++---------- 5 files changed, 91 insertions(+), 41 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index b67346cf52354..fc45366b7bd5b 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -142,11 +142,19 @@ The following briefly shows how a waking task is scheduled and executed. scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, using ``ops.select_cpu()`` judiciously can be simpler and more efficient. + A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by + calling ``scx_bpf_dispatch()``. If the task is dispatched to + ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the + local DSQ of whichever CPU is returned from ``ops.select_cpu()``. + Additionally, dispatching directly from ``ops.select_cpu()`` will cause the + ``ops.enqueue()`` callback to be skipped. + Note that the scheduler core will ignore an invalid CPU selection, for example, if it's outside the allowed cpumask of the task. -2. Once the target CPU is selected, ``ops.enqueue()`` is invoked. It can - make one of the following decisions: +2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the + task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()`` + can make one of the following decisions: * Immediately dispatch the task to either the global or local DSQ by calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index b20a7620b93d7..5096389ed89db 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -184,6 +184,11 @@ struct sched_ext_ops { * If an idle CPU is returned, the CPU is kicked and will try to * dispatch. While an explicit custom mechanism can be added, * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be dispatched directly by calling scx_bpf_dispatch(). 
If @p + * is dispatched, the ops.enqueue() callback will be skipped. Finally, + * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the + * local DSQ of whatever CPU is returned by this callback. */ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); @@ -196,6 +201,9 @@ struct sched_ext_ops { * or enqueue on the BPF scheduler. If not directly dispatched, the bpf * scheduler owns @p and if it fails to dispatch @p, the task will * stall. + * + * If @p was dispatched from ops.select_cpu(), this callback is + * skipped. */ void (*enqueue)(struct task_struct *p, u64 enq_flags); @@ -597,7 +605,7 @@ struct scx_dispatch_q { enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ - SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */ + SCX_TASK_DDSP_PRIQ = 1 << 2, /* task should be enqueued on priq when directly dispatched */ SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ @@ -630,7 +638,7 @@ enum scx_kf_mask { SCX_KF_CPU_RELEASE = 1 << 2, /* ops.cpu_release() */ /* ops.dequeue (in REST) may be nested inside DISPATCH */ SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */ - SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() */ + SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | @@ -685,6 +693,12 @@ struct sched_ext_entity { */ u64 dsq_vtime; + /* + * Used to track when a task has requested a direct dispatch from the + * ops.select_cpu() path. + */ + u64 ddsq_id; + /* * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. diff --git a/init/init_task.c b/init/init_task.c index 20fa6efc07f2e..56c49c02d830f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -113,6 +113,7 @@ struct task_struct init_task .ops_state = ATOMIC_INIT(0), .runnable_at = INITIAL_JIFFIES, .slice = SCX_SLICE_DFL, + .ddsq_id = SCX_DSQ_INVALID, }, #endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d50e9dfee5172..c8885037f2a30 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4564,6 +4564,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) atomic_long_set(&p->scx.ops_state, 0); p->scx.runnable_at = INITIAL_JIFFIES; p->scx.slice = SCX_SLICE_DFL; + p->scx.ddsq_id = SCX_DSQ_INVALID; #endif #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 634fcb7cb2431..1882d1ccc019d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -786,18 +786,24 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, return dsq; } -static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p, - u64 dsq_id, u64 enq_flags) +static void mark_direct_dispatch(struct task_struct *ddsp_task, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) { - struct scx_dispatch_q *dsq; + /* + * Mark that dispatch already happened from ops.select_cpu() or + * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value + * which can never match a valid task pointer. 
+ */ + __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); - /* @p must match the task which is being enqueued */ + /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) scx_ops_error("%s[%d] already direct-dispatched", p->comm, p->pid); else - scx_ops_error("enqueueing %s[%d] but trying to direct-dispatch %s[%d]", + scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", ddsp_task->comm, ddsp_task->pid, p->comm, p->pid); return; @@ -814,16 +820,28 @@ static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p return; } + WARN_ON_ONCE(p->scx.ddsq_id != SCX_DSQ_INVALID); + WARN_ON_ONCE(p->scx.flags & SCX_TASK_DDSP_PRIQ); + + p->scx.ddsq_id = dsq_id; + if (enq_flags & SCX_ENQ_DSQ_PRIQ) + p->scx.flags |= SCX_TASK_DDSP_PRIQ; +} + +static void direct_dispatch(struct task_struct *p, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + touch_core_sched_dispatch(task_rq(p), p); - dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p); - dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + if (p->scx.flags & SCX_TASK_DDSP_PRIQ) { + enq_flags |= SCX_ENQ_DSQ_PRIQ; + p->scx.flags &= ~SCX_TASK_DDSP_PRIQ; + } - /* - * Mark that dispatch already happened by spoiling direct_dispatch_task - * with a non-NULL value which can never match a valid task pointer. - */ - __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); + dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsq_id, p); + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + p->scx.ddsq_id = SCX_DSQ_INVALID; } static bool test_rq_online(struct rq *rq) @@ -843,10 +861,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); - if (p->scx.flags & SCX_TASK_ENQ_LOCAL) { - enq_flags |= SCX_ENQ_LOCAL; - p->scx.flags &= ~SCX_TASK_ENQ_LOCAL; - } + if (p->scx.ddsq_id != SCX_DSQ_INVALID) + goto direct; /* rq migration */ if (sticky_cpu == cpu_of(rq)) @@ -889,13 +905,19 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + *ddsp_taskp = NULL; + if (p->scx.ddsq_id != SCX_DSQ_INVALID) + goto direct; + /* * If not directly dispatched, QUEUEING isn't clear yet and dispatch or * dequeue may be waiting. The store_release matches their load_acquire. 
*/ - if (*ddsp_taskp == p) - atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); - *ddsp_taskp = NULL; + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + return; + +direct: + direct_dispatch(p, enq_flags); return; local: @@ -2011,10 +2033,8 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING)) { cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - p->scx.flags |= SCX_TASK_ENQ_LOCAL; - return cpu; - } + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto dispatch_local; } if (p->nr_cpus_allowed == 1) @@ -2027,38 +2047,44 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag if (sched_smt_active()) { if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && test_and_clear_cpu_idle(prev_cpu)) { - p->scx.flags |= SCX_TASK_ENQ_LOCAL; - return prev_cpu; + cpu = prev_cpu; + goto dispatch_local; } cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); - if (cpu >= 0) { - p->scx.flags |= SCX_TASK_ENQ_LOCAL; - return cpu; - } + if (cpu >= 0) + goto dispatch_local; } if (test_and_clear_cpu_idle(prev_cpu)) { - p->scx.flags |= SCX_TASK_ENQ_LOCAL; - return prev_cpu; + cpu = prev_cpu; + goto dispatch_local; } cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) { - p->scx.flags |= SCX_TASK_ENQ_LOCAL; - return cpu; - } + if (cpu >= 0) + goto dispatch_local; return prev_cpu; + +dispatch_local: + p->scx.ddsq_id = SCX_DSQ_LOCAL; + return cpu; } static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { if (SCX_HAS_OP(select_cpu)) { s32 cpu; + struct task_struct **ddsp_taskp; + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; - cpu = SCX_CALL_OP_TASK_RET(SCX_KF_REST, select_cpu, p, prev_cpu, + cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE, select_cpu, p, prev_cpu, wake_flags); + *ddsp_taskp = NULL; if (ops_cpu_valid(cpu)) { return cpu; } else { @@ -3870,7 +3896,7 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags ddsp_task = __this_cpu_read(direct_dispatch_task); if (ddsp_task) { - direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); return; } From 08fc865889d2d4fe344f158a619e4c5ec1bb7268 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 21 Dec 2023 18:59:28 -0600 Subject: [PATCH 223/304] scx: Remove SCX_ENQ_LOCAL flag Now that we support dispatching directly from ops.select_cpu(), the SCX_ENQ_LOCAL flag isn't needed. The last place it was used was on the SCX_ENQ_LAST path to control whether a task would be dispatched locally if ops.enqueue() wasn't defined. It doesn't really make sense to define SCX_OPS_ENQ_LAST but not ops.enqueue(), so let's remove SCX_ENQ_LOCAL and validate that SCX_OPS_ENQ_LAST is never passed if ops.enqueue() isn't defined. Signed-off-by: David Vernet --- Documentation/scheduler/sched-ext.rst | 5 +---- kernel/sched/ext.c | 28 ++++++++++++++++++++------- kernel/sched/ext.h | 11 ++--------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index fc45366b7bd5b..c2b8dca57af53 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -86,10 +86,7 @@ optional. 
The following modified excerpt is from void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) { - if (enq_flags & SCX_ENQ_LOCAL) - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); - else - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1882d1ccc019d..796713aaff227 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -886,12 +886,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, (enq_flags & SCX_ENQ_LAST)) goto local; - if (!SCX_HAS_OP(enqueue)) { - if (enq_flags & SCX_ENQ_LOCAL) - goto local; - else - goto global; - } + if (!SCX_HAS_OP(enqueue)) + goto global; /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; @@ -1793,7 +1789,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) * follow-up scheduling event. */ if (list_empty(&rq->scx.local_dsq.fifo)) - do_enqueue_task(rq, p, SCX_ENQ_LAST | SCX_ENQ_LOCAL, -1); + do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); else do_enqueue_task(rq, p, 0, -1); } @@ -3217,6 +3213,20 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) return helper; } +static int validate_ops(const struct sched_ext_ops *ops) +{ + /* + * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the + * ops.enqueue() callback isn't implemented. + */ + if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { + scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + return -EINVAL; + } + + return 0; +} + static int scx_ops_enable(struct sched_ext_ops *ops) { struct scx_task_iter sti; @@ -3280,6 +3290,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err_disable; } + ret = validate_ops(ops); + if (ret) + goto err_disable; + WARN_ON_ONCE(scx_dsp_buf); scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 27248760f4ccb..a8f72efe39b36 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -43,8 +43,8 @@ enum scx_enq_flags { /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with - * %SCX_ENQ_LAST and %SCX_ENQ_LOCAL flags set. + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. * * If the BPF scheduler wants to continue executing the task, * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. @@ -54,13 +54,6 @@ enum scx_enq_flags { */ SCX_ENQ_LAST = 1LLU << 41, - /* - * A hint indicating that it's advisable to enqueue the task on the - * local dsq of the currently selected CPU. Currently used by - * select_cpu_dfl() and together with %SCX_ENQ_LAST. - */ - SCX_ENQ_LOCAL = 1LLU << 42, - /* high 8 bits are internal */ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, From fadfa2fb5894723302e579a0edbd17b595572d91 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 22 Dec 2023 00:30:53 -0600 Subject: [PATCH 224/304] scx: Add scx_bpf_select_cpu_dfl() kfunc Some scheduler implementations may want to have ops.enqueue() invoked even if scx_select_cpu_dfl() finds an idle core for the enqueuing task to run on. 
In order to enable this, we can add a new scx_bpf_select_cpu_dfl() kfunc which allows a BPF scheduler to get the same behavior as the default ops.select_cpu() implementation, and then decide whether they want to dispatch directly from ops.select_cpu(). Signed-off-by: David Vernet --- Documentation/scheduler/sched-ext.rst | 50 ++++++++++++++++++++++++--- include/linux/sched/ext.h | 7 ++-- kernel/sched/ext.c | 49 ++++++++++++++++++-------- 3 files changed, 85 insertions(+), 21 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index c2b8dca57af53..3e1e0a4e974d7 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -77,18 +77,59 @@ optional. The following modified excerpt is from .. code-block:: c - s32 BPF_STRUCT_OPS(simple_init) + /* + * Decide which CPU a task should be migrated to before being + * enqueued (either at wakeup, fork time, or exec time). If an + * idle core is found by the default ops.select_cpu() implementation, + * then dispatch the task directly to SCX_DSQ_LOCAL and skip the + * ops.enqueue() callback. + * + * Note that this implemenation has exactly the same behavior as the + * default ops.select_cpu implementation. The behavior of the scheduler + * would be exactly same if the implementation just didn't define the + * simple_select_cpu() struct_ops prog. + */ + s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) { - if (!switch_partial) - scx_bpf_switch_all(); - return 0; + s32 cpu; + /* Need to initialize or the BPF verifier will reject the program */ + bool direct = false; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct); + + if (direct) + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + + return cpu; } + /* + * Do a direct dispatch of a task to the global DSQ. This ops.enqueue() + * callback will only be invoked if we failed to find a core to dispatch + * to in ops.select_cpu() above. + * + * Note that this implemenation has exactly the same behavior as the + * default ops.enqueue implementation, which just dispatches the task + * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly same + * if the implementation just didn't define the simple_enqueue struct_ops + * prog. + */ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) { scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } + s32 BPF_STRUCT_OPS(simple_init) + { + /* + * All SCHED_OTHER, SCHED_IDLE, and SCHED_BATCH tasks should + * use sched_ext. + */ + scx_bpf_switch_all(); + return 0; + } + void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) { exit_type = ei->type; @@ -96,6 +137,7 @@ optional. 
The following modified excerpt is from SEC(".struct_ops") struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, .enqueue = (void *)simple_enqueue, .init = (void *)simple_init, .exit = (void *)simple_exit, diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 5096389ed89db..5a03363493924 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -639,11 +639,12 @@ enum scx_kf_mask { /* ops.dequeue (in REST) may be nested inside DISPATCH */ SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() and ops.select_cpu() */ - SCX_KF_REST = 1 << 5, /* other rq-locked operations */ + SCX_KF_SELECT_CPU = 1 << 5, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 6, /* other rq-locked operations */ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | - SCX_KF_ENQUEUE | SCX_KF_REST, - __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_REST, + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 796713aaff227..7b4825ec19aa8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -272,7 +272,7 @@ do { \ */ #define SCX_CALL_OP_TASK(mask, op, task, args...) \ do { \ - BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ SCX_CALL_OP(mask, op, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ @@ -281,7 +281,7 @@ do { \ #define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ ({ \ __typeof__(scx_ops.op(task, ##args)) __ret; \ - BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ @@ -291,7 +291,7 @@ do { \ #define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) 
\ ({ \ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ - BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ @@ -2013,10 +2013,13 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) goto retry; } -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) +static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *found) { s32 cpu; + *found = false; + if (!static_branch_likely(&scx_builtin_idle_enabled)) { scx_ops_error("built-in idle tracking is disabled"); return prev_cpu; @@ -2030,7 +2033,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto dispatch_local; + goto cpu_found; } if (p->nr_cpus_allowed == 1) @@ -2044,30 +2047,41 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flag if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; - goto dispatch_local; + goto cpu_found; } cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); if (cpu >= 0) - goto dispatch_local; + goto cpu_found; } if (test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; - goto dispatch_local; + goto cpu_found; } cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) - goto dispatch_local; + goto cpu_found; return prev_cpu; -dispatch_local: - p->scx.ddsq_id = SCX_DSQ_LOCAL; +cpu_found: + *found = true; return cpu; } +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) +{ + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { + *found = false; + return prev_cpu; + } + + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, found); +} + static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { if (SCX_HAS_OP(select_cpu)) { @@ -2078,8 +2092,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE, select_cpu, p, prev_cpu, - wake_flags); + cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, p, prev_cpu, wake_flags); *ddsp_taskp = NULL; if (ops_cpu_valid(cpu)) { return cpu; @@ -2088,7 +2102,13 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag return prev_cpu; } } else { - return scx_select_cpu_dfl(p, prev_cpu, wake_flags); + bool found; + s32 cpu; + + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); + if (found) + p->scx.ddsq_id = SCX_DSQ_LOCAL; + return cpu; } } @@ -4487,6 +4507,7 @@ BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_SET8_END(scx_kfunc_ids_ops_only) static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = { From 9fd2c3bc5b669655fff7562f0017607fbe836795 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 22 Dec 2023 01:10:18 -0600 Subject: [PATCH 225/304] scx: Add selftests for new select_cpu dispatch semantics Let's test the new semantics for being able to do direct dispatch from ops.select_cpu(), including testing when SCX_OPS_ENQ_DFL_NO_DISPATCH is specified. 
Also adds a testcase validating that we can't load a scheduler with SCX_OPS_ENQ_LAST if ops.enqueue() is not defined. Signed-off-by: David Vernet --- tools/testing/selftests/scx/.gitignore | 7 ++ tools/testing/selftests/scx/Makefile | 12 ++- .../selftests/scx/enq_last_no_enq_fails.bpf.c | 29 ++++++ .../selftests/scx/enq_last_no_enq_fails.c | 34 +++++++ .../scx/enqueue_select_cpu_fails.bpf.c | 54 +++++++++++ .../selftests/scx/enqueue_select_cpu_fails.c | 36 +++++++ tools/testing/selftests/scx/scx_test.h | 26 +++++ .../selftests/scx/select_cpu_dfl.bpf.c | 42 ++++++++ tools/testing/selftests/scx/select_cpu_dfl.c | 52 ++++++++++ .../scx/select_cpu_dfl_nodispatch.bpf.c | 97 +++++++++++++++++++ .../selftests/scx/select_cpu_dfl_nodispatch.c | 52 ++++++++++ .../selftests/scx/select_cpu_dispatch.bpf.c | 49 ++++++++++ .../selftests/scx/select_cpu_dispatch.c | 52 ++++++++++ .../scx/select_cpu_dispatch_bad_dsq.bpf.c | 45 +++++++++ .../scx/select_cpu_dispatch_bad_dsq.c | 57 +++++++++++ .../scx/select_cpu_dispatch_dbl_dsp.bpf.c | 46 +++++++++ .../scx/select_cpu_dispatch_dbl_dsp.c | 57 +++++++++++ 17 files changed, 746 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c create mode 100644 tools/testing/selftests/scx/enq_last_no_enq_fails.c create mode 100644 tools/testing/selftests/scx/enqueue_select_cpu_fails.bpf.c create mode 100644 tools/testing/selftests/scx/enqueue_select_cpu_fails.c create mode 100644 tools/testing/selftests/scx/scx_test.h create mode 100644 tools/testing/selftests/scx/select_cpu_dfl.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dfl.c create mode 100644 tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore index 72fc34154e98b..8e5d7c1aab5b0 100644 --- a/tools/testing/selftests/scx/.gitignore +++ b/tools/testing/selftests/scx/.gitignore @@ -1,2 +1,9 @@ +enq_last_no_enq_fails +enqueue_select_cpu_fails minimal +select_cpu_dfl +select_cpu_dfl_nodispatch +select_cpu_dispatch +select_cpu_dispatch_dbl_dsp +select_cpu_dispatch_bad_dsq build/ diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index c331cfc380b6c..3af9edc08c07f 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -42,6 +42,8 @@ SCXOBJ_DIR := $(OBJ_DIR)/sched_ext BPFOBJ := $(BPFOBJ_DIR)/libbpf.a LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ /sys/kernel/btf/vmlinux \ @@ -145,7 +147,15 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP ################ # C schedulers # ################ -c-sched-targets := minimal +c-sched-targets := \ + minimal \ + select_cpu_dfl \ + select_cpu_dfl_nodispatch \ + select_cpu_dispatch \ + select_cpu_dispatch_dbl_dsp \ + select_cpu_dispatch_bad_dsq 
\ + enqueue_select_cpu_fails \ + enq_last_no_enq_fails $(c-sched-targets): %: $(filter-out %.bpf.c,%.c) $(INCLUDE_DIR)/%.bpf.skel.h $(eval sched=$(notdir $@)) diff --git a/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c new file mode 100644 index 0000000000000..4b0f84568dc15 --- /dev/null +++ b/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(enq_last_no_enq_fails_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_last_no_enq_fails_ops = { + .init = enq_last_no_enq_fails_init, + .name = "enq_last_no_enq_fails", + /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/enq_last_no_enq_fails.c b/tools/testing/selftests/scx/enq_last_no_enq_fails.c new file mode 100644 index 0000000000000..1f3d4d8adcc7f --- /dev/null +++ b/tools/testing/selftests/scx/enq_last_no_enq_fails.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "enq_last_no_enq_fails.bpf.skel.h" +#include "scx_test.h" + +int main(int argc, char **argv) +{ + struct enq_last_no_enq_fails *skel; + struct bpf_link *link; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = enq_last_no_enq_fails__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); + SCX_BUG_ON(link, "Succeeded in attaching struct_ops"); + + bpf_link__destroy(link); + enq_last_no_enq_fails__destroy(skel); + + return 0; +} diff --git a/tools/testing/selftests/scx/enqueue_select_cpu_fails.bpf.c b/tools/testing/selftests/scx/enqueue_select_cpu_fails.bpf.c new file mode 100644 index 0000000000000..61f04fa4ce2b3 --- /dev/null +++ b/tools/testing/selftests/scx/enqueue_select_cpu_fails.bpf.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(enqueue_select_cpu_fails_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(enqueue_select_cpu_fails_enqueue, struct task_struct *p, + u64 enq_flags) +{ + /* + * Need to initialize the variable or the verifier will fail to load. + * Improving these semantics is actively being worked on. 
+ */ + bool found = false; + + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(enqueue_select_cpu_fails_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops enqueue_select_cpu_fails_ops = { + .select_cpu = enqueue_select_cpu_fails_select_cpu, + .enqueue = enqueue_select_cpu_fails_enqueue, + .init = enqueue_select_cpu_fails_init, + .name = "enqueue_select_cpu_fails", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/enqueue_select_cpu_fails.c b/tools/testing/selftests/scx/enqueue_select_cpu_fails.c new file mode 100644 index 0000000000000..f45740370f508 --- /dev/null +++ b/tools/testing/selftests/scx/enqueue_select_cpu_fails.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "enqueue_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +int main(int argc, char **argv) +{ + struct enqueue_select_cpu_fails *skel; + struct bpf_link *link; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = enqueue_select_cpu_fails__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.enqueue_select_cpu_fails_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + sleep(1); + + bpf_link__destroy(link); + enqueue_select_cpu_fails__destroy(skel); + + return 0; +} diff --git a/tools/testing/selftests/scx/scx_test.h b/tools/testing/selftests/scx/scx_test.h new file mode 100644 index 0000000000000..6a61763b19ab5 --- /dev/null +++ b/tools/testing/selftests/scx/scx_test.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ + +#ifndef __SCX_TEST_H__ +#define __SCX_TEST_H__ + +#include + +#define SCX_GT(_x, _y) SCX_BUG_ON((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_GE(_x, _y) SCX_BUG_ON((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LT(_x, _y) SCX_BUG_ON((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LE(_x, _y) SCX_BUG_ON((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_EQ(_x, _y) SCX_BUG_ON((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_ASSERT(_x) SCX_BUG_ON(!(_x), "Expected %s to be true (%lu)", \ + #_x, (u64)(_x)) + +#endif // # __SCX_TEST_H__ diff --git a/tools/testing/selftests/scx/select_cpu_dfl.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c new file mode 100644 index 0000000000000..091bf1ed9bec0 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + u64 enq_flags) +{ + const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); + + if (p->nr_cpus_allowed > 1 && + bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) + saw_local = true; + scx_bpf_put_idle_cpumask(idle_mask); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_ops = { + .enqueue = select_cpu_dfl_enqueue, + .init = select_cpu_dfl_init, + .name = "select_cpu_dfl", +}; diff --git a/tools/testing/selftests/scx/select_cpu_dfl.c b/tools/testing/selftests/scx/select_cpu_dfl.c new file mode 100644 index 0000000000000..2962be1bec518 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "select_cpu_dfl.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +int main(int argc, char **argv) +{ + struct select_cpu_dfl *skel; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = select_cpu_dfl__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(!skel->bss->saw_local); + bpf_link__destroy(link); + select_cpu_dfl__destroy(skel); + + return 0; +} diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c new file mode 100644 index 0000000000000..9d026e0cbdbb4 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag + * specified. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* CPU changed by ops.select_cpu() */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Manually specify the signature until the kfunc is added to the scx repo. 
*/ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, + &tctx->force_local); + + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + u64 enq_flags) +{ + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (tctx->force_local) { + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + saw_local = true; + } + + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_prep_enable, + struct task_struct *p, struct scx_enable_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { + .select_cpu = select_cpu_dfl_nodispatch_select_cpu, + .enqueue = select_cpu_dfl_nodispatch_enqueue, + .prep_enable = select_cpu_dfl_nodispatch_prep_enable, + .init = select_cpu_dfl_nodispatch_init, + .name = "select_cpu_dfl_nodispatch", +}; diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c new file mode 100644 index 0000000000000..3121b28c81ed0 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include "select_cpu_dfl_nodispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +int main(int argc, char **argv) +{ + struct select_cpu_dfl_nodispatch *skel; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(skel->bss->saw_local); + bpf_link__destroy(link); + select_cpu_dfl_nodispatch__destroy(skel); + + return 0; +} diff --git a/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c new file mode 100644 index 0000000000000..0fda977697251 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + u64 dsq_id = SCX_DSQ_LOCAL; + s32 cpu = prev_cpu; + + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + goto dispatch; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto dispatch; + + dsq_id = SCX_DSQ_GLOBAL; + cpu = prev_cpu; + +dispatch: + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; +} + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_ops = { + .select_cpu = select_cpu_dispatch_select_cpu, + .init = select_cpu_dispatch_init, + .name = "select_cpu_dispatch", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch.c b/tools/testing/selftests/scx/select_cpu_dispatch.c new file mode 100644 index 0000000000000..a3625f75db720 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "select_cpu_dispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +int main(int argc, char **argv) +{ + struct select_cpu_dispatch *skel; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = select_cpu_dispatch__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + + bpf_link__destroy(link); + select_cpu_dispatch__destroy(skel); + + return 0; +} diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c new file mode 100644 index 0000000000000..c9105add924d5 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching to a random DSQ should fail. 
*/ + scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { + .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, + .init = select_cpu_dispatch_bad_dsq_init, + .exit = select_cpu_dispatch_bad_dsq_exit, + .name = "select_cpu_dispatch_bad_dsq", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c new file mode 100644 index 0000000000000..f1094e3645d61 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 +#define SCX_EXIT_ERROR 1024 + +int main(int argc, char **argv) +{ + struct select_cpu_dispatch_bad_dsq *skel; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = select_cpu_dispatch_bad_dsq__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + /* + * The scheduler is expected to gracefully exit after bad_dsqoneously + * double-dispatching from ops.selec_cpu(). + */ + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + select_cpu_dispatch_bad_dsq__destroy(skel); + + return 0; +} diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c new file mode 100644 index 0000000000000..82d8148399f28 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching twice in a row is disallowed. 
*/ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { + .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, + .init = select_cpu_dispatch_dbl_dsp_init, + .exit = select_cpu_dispatch_dbl_dsp_exit, + .name = "select_cpu_dispatch_dbl_dsp", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c new file mode 100644 index 0000000000000..9736b65f79bd0 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 +#define SCX_EXIT_ERROR 1024 + +int main(int argc, char **argv) +{ + struct select_cpu_dispatch_dbl_dsp *skel; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = select_cpu_dispatch_dbl_dsp__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + /* + * The scheduler is expected to gracefully exit after + * double-dispatching from ops.select_cpu(). + */ + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + select_cpu_dispatch_dbl_dsp__destroy(skel); + + return 0; +} From 2638affbd2ac8a5cb6ef4705d5797c1e98f7f366 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 4 Jan 2024 16:19:13 -0600 Subject: [PATCH 226/304] scx: Error for a priq builtin DSQ in dispatch_enqueue() We're currently checking whether a builtin DSQ is being used with priq in scx_bpf_dispatch_vtime(). This neglects the fact that we could end up falling back to scx_dsq_global if there's an error. If we error out with SCX_ENQ_DSQ_PRIQ set in enqueue flags, we would trigger a warning in dispatch_enqueue(). Let's instead just move the check to inside of dispatch_enqueue(). Signed-off-by: David Vernet --- kernel/sched/ext.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7b4825ec19aa8..9ff5f208d6a64 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -646,8 +646,20 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. 
+ */ + scx_ops_error("Cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { - WARN_ON_ONCE(dsq->id & SCX_DSQ_FLAG_BUILTIN); p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); @@ -4015,17 +4027,6 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, if (!scx_dispatch_preamble(p, enq_flags)) return; - /* - * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from their FIFO - * queues. To avoid confusion and accidentally starving - * vtime-dispatched tasks by FIFO-dispatched tasks, we disallow any - * internal DSQ from doing vtime ordering of tasks. - */ - if (dsq_id & SCX_DSQ_FLAG_BUILTIN) { - scx_ops_error("Cannot use vtime ordering for built-in DSQs"); - return; - } - if (slice) p->scx.slice = slice; else From d5b84a488ca14b912b73c17a41d508f7fd970601 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 4 Jan 2024 16:28:57 -0600 Subject: [PATCH 227/304] scx: Add testcases for vtime-dispatching to builtin DSQs Let's verify that we're disallowing builtin DSQs from being dispatched to. Signed-off-by: David Vernet --- tools/testing/selftests/scx/.gitignore | 2 + tools/testing/selftests/scx/Makefile | 4 +- .../selftests/scx/dsp_fallbackdsq_fail.bpf.c | 42 +++++++++++++++++++ .../selftests/scx/dsp_fallbackdsq_fail.c | 36 ++++++++++++++++ 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/scx/dsp_fallbackdsq_fail.bpf.c create mode 100644 tools/testing/selftests/scx/dsp_fallbackdsq_fail.c diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore index 8e5d7c1aab5b0..ab806b18d9dba 100644 --- a/tools/testing/selftests/scx/.gitignore +++ b/tools/testing/selftests/scx/.gitignore @@ -1,3 +1,5 @@ +dsp_fallbackdsq_fail +dsp_localdsq_fail enq_last_no_enq_fails enqueue_select_cpu_fails minimal diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index 3af9edc08c07f..e993335a22e0c 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -155,7 +155,9 @@ c-sched-targets := \ select_cpu_dispatch_dbl_dsp \ select_cpu_dispatch_bad_dsq \ enqueue_select_cpu_fails \ - enq_last_no_enq_fails + enq_last_no_enq_fails \ + dsp_localdsq_fail \ + dsp_fallbackdsq_fail $(c-sched-targets): %: $(filter-out %.bpf.c,%.c) $(INCLUDE_DIR)/%.bpf.skel.h $(eval sched=$(notdir $@)) diff --git a/tools/testing/selftests/scx/dsp_fallbackdsq_fail.bpf.c b/tools/testing/selftests/scx/dsp_fallbackdsq_fail.bpf.c new file mode 100644 index 0000000000000..d15ad9b0b2c35 --- /dev/null +++ b/tools/testing/selftests/scx/dsp_fallbackdsq_fail.bpf.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(dsp_fallbackdsq_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. 
+ */ + scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +s32 BPF_STRUCT_OPS(dsp_fallbackdsq_fail_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops dsp_fallbackdsq_fail_ops = { + .select_cpu = dsp_fallbackdsq_fail_select_cpu, + .init = dsp_fallbackdsq_fail_init, + .name = "dsp_fallbackdsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/dsp_fallbackdsq_fail.c b/tools/testing/selftests/scx/dsp_fallbackdsq_fail.c new file mode 100644 index 0000000000000..fd70cd89d9d06 --- /dev/null +++ b/tools/testing/selftests/scx/dsp_fallbackdsq_fail.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "dsp_fallbackdsq_fail.bpf.skel.h" +#include "scx_test.h" + +int main(int argc, char **argv) +{ + struct dsp_fallbackdsq_fail *skel; + struct bpf_link *link; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = dsp_fallbackdsq_fail__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.dsp_fallbackdsq_fail_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + sleep(1); + + bpf_link__destroy(link); + dsp_fallbackdsq_fail__destroy(skel); + + return 0; +} From 56b2ec9383e88f07f874ea853b62ee0771524af2 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 13 Dec 2023 15:03:51 -0600 Subject: [PATCH 228/304] scx: Always set task scx weight before enable We were previously only calling it on the fork path, but we need to be calling it on the enable path as well. Signed-off-by: David Vernet --- kernel/sched/ext.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9ff5f208d6a64..dab874f0b4626 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2332,6 +2332,13 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) return 0; } +static void set_task_scx_weight(struct task_struct *p) +{ + u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); +} + static void scx_ops_enable_task(struct task_struct *p) { lockdep_assert_rq_held(task_rq(p)); @@ -2341,6 +2348,15 @@ static void scx_ops_enable_task(struct task_struct *p) struct scx_enable_args args = { SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) }; + + /* + * Set the weight manually before calling ops.enable() so that + * the scheduler doesn't see a stale value if they inspect the + * task struct. ops.set_weight() is invoked afterwards in the + * caller, as it would be odd to receive a callback on the task + * before we tell the scheduler that it's been fully enabled. 
+ */ + set_task_scx_weight(p); SCX_CALL_OP_TASK(SCX_KF_REST, enable, p, &args); } p->scx.flags &= ~SCX_TASK_OPS_PREPPED; @@ -2366,13 +2382,6 @@ static void scx_ops_disable_task(struct task_struct *p) } } -static void set_task_scx_weight(struct task_struct *p) -{ - u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; - - p->scx.weight = sched_weight_to_cgroup(weight); -} - /** * refresh_scx_weight - Refresh a task's ext weight * @p: task to refresh ext weight for @@ -2419,14 +2428,6 @@ void scx_post_fork(struct task_struct *p) struct rq *rq; rq = task_rq_lock(p, &rf); - /* - * Set the weight manually before calling ops.enable() so that - * the scheduler doesn't see a stale value if they inspect the - * task struct. We'll invoke ops.set_weight() afterwards, as it - * would be odd to receive a callback on the task before we - * tell the scheduler that it's been fully enabled. - */ - set_task_scx_weight(p); scx_ops_enable_task(p); refresh_scx_weight(p); task_rq_unlock(rq, p, &rf); From 960444173886e837ab7d82b81767802c9ed430d8 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 14 Dec 2023 13:26:15 -0600 Subject: [PATCH 229/304] scx: Call enable / disable on entry / exit to scx Currently, the ops.enable() and ops.disable() callbacks are invoked a single time for every task on the system. ops.enable() is invoked shortly after a task succeeds in ops.prep_enable(), and ops.disable() is invoked when a task exits, or when the BPF scheduler is unloaded. This API is a bit odd because ops.enable() can be invoked well before a task actually starts running in the BPF scheduler, so it's not necessarily useful as a way to bootstrap a process. For example, scx_simple does the following: void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, struct scx_enable_args *args) { p->scx.dsq_vtime = vtime_now; } If the task later switches to sched_ext, the value will of course be stale. While it ends up balancing out due to logic elsewhere in the scheduler, it's indicative of a somewhat awkward component of the API that can be improved. Instead, this patch has ops.enable() be invoked when a task is entering the scheduler for the first time, and and ops.disable() be invoked whenever a task is leaving the scheduler; be it because of exiting, the scheduler being unloaded, or the task manually switching sched policies. Signed-off-by: David Vernet --- include/linux/sched/ext.h | 36 ++++++--- init/init_task.c | 1 + kernel/sched/ext.c | 162 +++++++++++++++++++++++++------------- 3 files changed, 136 insertions(+), 63 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 5a03363493924..732ce680131cc 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -431,11 +431,11 @@ struct sched_ext_ops { * * Either we're loading a BPF scheduler or a new task is being forked. * Prepare BPF scheduling for @p. This operation may block and can be - * used for allocations. + * used for allocations, and is called exactly once for a task. * * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During a fork, will - * abort the specific fork. + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. */ s32 (*prep_enable)(struct task_struct *p, struct scx_enable_args *args); @@ -444,8 +444,9 @@ struct sched_ext_ops { * @p: task to enable BPF scheduling for * @args: enable arguments, see the struct definition * - * Enable @p for BPF scheduling. 
@p is now in the cgroup specified for - * the preceding prep_enable() and will start running soon. + * Enable @p for BPF scheduling. @p is now in the cgroup specified in + * @args. enable() is called on @p any time it enters SCX, and is + * always paired with a matching disable(). */ void (*enable)(struct task_struct *p, struct scx_enable_args *args); @@ -465,7 +466,8 @@ struct sched_ext_ops { * @p: task to disable BPF scheduling for * * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. - * Disable BPF scheduling for @p. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. */ void (*disable)(struct task_struct *p); @@ -606,14 +608,15 @@ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ SCX_TASK_DDSP_PRIQ = 1 << 2, /* task should be enqueued on priq when directly dispatched */ - - SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ - SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ + SCX_TASK_STATE_0 = 1 << 3, /* first bit encoding the task's current state */ + SCX_TASK_STATE_1 = 1 << 4, /* second bit encoding the task's current state */ SCX_TASK_WATCHDOG_RESET = 1 << 16, /* task watchdog counter should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ + + SCX_TASK_STATE_MASK = SCX_TASK_STATE_0 | SCX_TASK_STATE_1, }; /* scx_entity.dsq_flags */ @@ -647,6 +650,21 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; +/* scx_entity.task_state */ +enum scx_task_state { + /* ops.prep_enable() has not yet been called on task */ + SCX_TASK_NONE, + + /* ops.prep_enable() succeeded on task, but it still be cancelled */ + SCX_TASK_INIT, + + /* Task is fully initialized, but not being scheduled in sched_ext */ + SCX_TASK_READY, + + /* Task is fully initialized and is being scheduled in sched_ext */ + SCX_TASK_ENABLED, +}; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. 
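[Editorial aside, not part of the patch: the two new SCX_TASK_STATE_* flag bits introduced above encode the four scx_task_state values. The following is a minimal standalone sketch of that 2-bit encoding, mirroring the bit values from the hunk above; the macro, enum, and function names here (STATE_0, decode(), etc.) are illustrative only and are not the kernel's.]

	/* Sketch of the 2-bit task-state encoding added by this patch. */
	#include <stdio.h>

	#define STATE_0 (1 << 3)	/* mirrors SCX_TASK_STATE_0 */
	#define STATE_1 (1 << 4)	/* mirrors SCX_TASK_STATE_1 */

	enum task_state { NONE, INIT, READY, ENABLED };

	static enum task_state decode(unsigned int flags)
	{
		switch (flags & (STATE_0 | STATE_1)) {
		case STATE_0:			return INIT;	/* initialized, may still be cancelled */
		case STATE_1:			return READY;	/* initialized, not running on SCX */
		case STATE_0 | STATE_1:	return ENABLED;	/* running on SCX */
		default:			return NONE;	/* never initialized */
		}
	}

	int main(void)
	{
		/* Both bits set decodes to ENABLED, as in scx_get_task_state(). */
		printf("%d\n", decode(STATE_0 | STATE_1) == ENABLED);	/* prints 1 */
		return 0;
	}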
diff --git a/init/init_task.c b/init/init_task.c index 56c49c02d830f..c19041286b76c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -108,6 +108,7 @@ struct task_struct init_task .scx = { .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node), + .flags = 0, .sticky_cpu = -1, .holding_cpu = -1, .ops_state = ATOMIC_INIT(0), diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dab874f0b4626..55cc5f5c9a85f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2287,12 +2287,49 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + int state = p->scx.flags & SCX_TASK_STATE_MASK; + + switch (state) { + case SCX_TASK_STATE_0 | SCX_TASK_STATE_1: + return SCX_TASK_ENABLED; + case SCX_TASK_STATE_1: + return SCX_TASK_READY; + case SCX_TASK_STATE_0: + return SCX_TASK_INIT; + default: + return SCX_TASK_NONE; + } +} + +static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +{ + enum scx_task_state prev_state = scx_get_task_state(p); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + switch (state) { + case SCX_TASK_NONE: + return; + case SCX_TASK_INIT: + WARN_ON_ONCE(prev_state != SCX_TASK_NONE); + p->scx.flags |= SCX_TASK_STATE_0; + return; + case SCX_TASK_READY: + WARN_ON_ONCE(prev_state == SCX_TASK_NONE); + p->scx.flags |= SCX_TASK_STATE_1; + return; + case SCX_TASK_ENABLED: + WARN_ON_ONCE(prev_state != SCX_TASK_READY); + p->scx.flags |= (SCX_TASK_STATE_0 | SCX_TASK_STATE_1); + return; + } +} + static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) { int ret; - WARN_ON_ONCE(p->scx.flags & SCX_TASK_OPS_PREPPED); - p->scx.disallow = false; if (SCX_HAS_OP(prep_enable)) { @@ -2307,6 +2344,8 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) } } + scx_set_task_state(p, SCX_TASK_INIT); + if (p->scx.disallow) { struct rq *rq; struct rq_flags rf; @@ -2328,7 +2367,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) task_rq_unlock(rq, p, &rf); } - p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET); + p->scx.flags |= SCX_TASK_WATCHDOG_RESET; return 0; } @@ -2342,62 +2381,58 @@ static void set_task_scx_weight(struct task_struct *p) static void scx_ops_enable_task(struct task_struct *p) { lockdep_assert_rq_held(task_rq(p)); - WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); + /* + * Set the weight before calling ops.enable() so that the scheduler + * doesn't see a stale value if they inspect the task struct. + */ + set_task_scx_weight(p); if (SCX_HAS_OP(enable)) { struct scx_enable_args args = { SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) }; - /* - * Set the weight manually before calling ops.enable() so that - * the scheduler doesn't see a stale value if they inspect the - * task struct. ops.set_weight() is invoked afterwards in the - * caller, as it would be odd to receive a callback on the task - * before we tell the scheduler that it's been fully enabled. 
- */ - set_task_scx_weight(p); SCX_CALL_OP_TASK(SCX_KF_REST, enable, p, &args); } - p->scx.flags &= ~SCX_TASK_OPS_PREPPED; - p->scx.flags |= SCX_TASK_OPS_ENABLED; + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); } static void scx_ops_disable_task(struct task_struct *p) { lockdep_assert_rq_held(task_rq(p)); + WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); - if (p->scx.flags & SCX_TASK_OPS_PREPPED) { + if (SCX_HAS_OP(disable)) + SCX_CALL_OP(SCX_KF_REST, disable, p); + scx_set_task_state(p, SCX_TASK_READY); +} + +static void scx_ops_exit_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + + switch (scx_get_task_state(p)) { + case SCX_TASK_NONE: + return; + case SCX_TASK_INIT: if (SCX_HAS_OP(cancel_enable)) { struct scx_enable_args args = { SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) }; SCX_CALL_OP(SCX_KF_REST, cancel_enable, p, &args); } - p->scx.flags &= ~SCX_TASK_OPS_PREPPED; - } else if (p->scx.flags & SCX_TASK_OPS_ENABLED) { - if (SCX_HAS_OP(disable)) - SCX_CALL_OP(SCX_KF_REST, disable, p); - p->scx.flags &= ~SCX_TASK_OPS_ENABLED; + break; + case SCX_TASK_READY: + break; + case SCX_TASK_ENABLED: + scx_ops_disable_task(p); + break; } -} -/** - * refresh_scx_weight - Refresh a task's ext weight - * @p: task to refresh ext weight for - * - * @p->scx.weight carries the task's static priority in cgroup weight scale to - * enable easy access from the BPF scheduler. To keep it synchronized with the - * current task priority, this function should be called when a new task is - * created, priority is changed for a task on sched_ext, and a task is switched - * to sched_ext from other classes. - */ -static void refresh_scx_weight(struct task_struct *p) -{ - lockdep_assert_rq_held(task_rq(p)); - set_task_scx_weight(p); - if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + scx_set_task_state(p, SCX_TASK_NONE); } void scx_pre_fork(struct task_struct *p) @@ -2424,13 +2459,20 @@ int scx_fork(struct task_struct *p) void scx_post_fork(struct task_struct *p) { if (scx_enabled()) { - struct rq_flags rf; - struct rq *rq; + scx_set_task_state(p, SCX_TASK_READY); + /* + * Enable the task immediately if it's running on sched_ext. + * Otherwise, it'll be enabled in switching_to_scx() if and + * when it's ever configured to run with a SCHED_EXT policy. + */ + if (p->sched_class == &ext_sched_class) { + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &rf); - scx_ops_enable_task(p); - refresh_scx_weight(p); - task_rq_unlock(rq, p, &rf); + rq = task_rq_lock(p, &rf); + scx_ops_enable_task(p); + task_rq_unlock(rq, p, &rf); + } } spin_lock_irq(&scx_tasks_lock); @@ -2442,8 +2484,10 @@ void scx_post_fork(struct task_struct *p) void scx_cancel_fork(struct task_struct *p) { - if (scx_enabled()) - scx_ops_disable_task(p); + if (scx_enabled()) { + WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); + scx_ops_exit_task(p); + } percpu_up_read(&scx_fork_rwsem); } @@ -2456,22 +2500,26 @@ void sched_ext_free(struct task_struct *p) spin_unlock_irqrestore(&scx_tasks_lock, flags); /* - * @p is off scx_tasks and wholly ours. scx_ops_enable()'s PREPPED -> + * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. 
*/ - if (p->scx.flags & (SCX_TASK_OPS_PREPPED | SCX_TASK_OPS_ENABLED)) { + if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_ops_disable_task(p); + scx_ops_exit_task(p); task_rq_unlock(rq, p, &rf); } } static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) { - refresh_scx_weight(p); + lockdep_assert_rq_held(task_rq(p)); + + set_task_scx_weight(p); + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); } static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) @@ -2480,7 +2528,7 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) static void switching_to_scx(struct rq *rq, struct task_struct *p) { - refresh_scx_weight(p); + scx_ops_enable_task(p); /* * set_cpus_allowed_scx() is not called while @p is associated with a @@ -2491,6 +2539,11 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) (struct cpumask *)p->cpus_ptr); } +static void switched_from_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_disable_task(p); +} + static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} static void switched_to_scx(struct rq *rq, struct task_struct *p) {} @@ -2705,7 +2758,7 @@ static inline void scx_cgroup_unlock(void) {} * - task_fork/dead: We need fork/dead notifications for all tasks regardless of * their current sched_class. Call them directly from sched core instead. * - * - task_woken, switched_from: Unnecessary. + * - task_woken: Unnecessary. */ DEFINE_SCHED_CLASS(ext) = { .enqueue_task = enqueue_task_scx, @@ -2736,6 +2789,7 @@ DEFINE_SCHED_CLASS(ext) = { .task_tick = task_tick_scx, .switching_to = switching_to_scx, + .switched_from = switched_from_scx, .switched_to = switched_to_scx, .reweight_task = reweight_task_scx, .prio_changed = prio_changed_scx, @@ -3124,7 +3178,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) if (alive) check_class_changed(task_rq(p), p, old_class, p->prio); - scx_ops_disable_task(p); + scx_ops_exit_task(p); } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); @@ -3445,7 +3499,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - scx_ops_enable_task(p); + scx_set_task_state(p, SCX_TASK_READY); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); @@ -3453,7 +3507,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) check_class_changed(task_rq(p), p, old_class, p->prio); } else { - scx_ops_disable_task(p); + scx_ops_exit_task(p); } } scx_task_iter_exit(&sti); From 81e1051116ce50b3b9c99ed3de41927cdb981e77 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 14 Dec 2023 15:47:03 -0600 Subject: [PATCH 230/304] scx: Rename prep_enable() and cancel_enable(), add exit_task() ops.prep_enable() and ops.cancel_enable() have become arguably somewhat misnomers in that ops.enable() and ops.disable() may be called multiple times throughout a BPF prog being loaded, but ops.prep_enable() and ops.cancel_enable() will be called at most once. ops.prep_enable() is really more akin to initializing the task rather than preparing for ops.enable(), so let's rename it to ops.init_task() and ops.cancel_init() to reflect this. In addition, some schedulers are currently using ops.disable() to clean up whatever was initialized in (what was previously) ops.prep_enable(). 
This doesn't work now that ops.disable() can be called multiple times, so we also need to add a new callback called exit_task() which is called exactly once when a task is exiting (if it was previously successfully initialized). Signed-off-by: David Vernet --- include/linux/sched/ext.h | 53 +++++++++++++++++--------------- kernel/sched/ext.c | 63 ++++++++++++++++++--------------------- 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 732ce680131cc..73bcb9292954f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -123,14 +123,20 @@ enum scx_ops_flags { SCX_OPS_CGROUP_KNOB_WEIGHT, }; -/* argument container for ops.enable() and friends */ -struct scx_enable_args { +/* argument container for ops.init_task() */ +struct scx_init_task_args { #ifdef CONFIG_EXT_GROUP_SCHED /* the cgroup the task is joining */ struct cgroup *cgroup; #endif }; +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + /* argument container for ops->cgroup_init() */ struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ @@ -425,41 +431,40 @@ struct sched_ext_ops { void (*cpu_offline)(s32 cpu); /** - * prep_enable - Prepare to enable BPF scheduling for a task - * @p: task to prepare BPF scheduling for - * @args: enable arguments, see the struct definition + * init_task - Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition * * Either we're loading a BPF scheduler or a new task is being forked. - * Prepare BPF scheduling for @p. This operation may block and can be - * used for allocations, and is called exactly once for a task. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. * * Return 0 for success, -errno for failure. An error return while * loading will abort loading of the BPF scheduler. During a fork, it * will abort that specific fork. */ - s32 (*prep_enable)(struct task_struct *p, struct scx_enable_args *args); + s32 (*init_task)(struct task_struct *p, + struct scx_init_task_args *args); + + /** + * exit_task - Exit a previously-running task from the system + * @p: task to exit + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, + struct scx_exit_task_args *args); /** * enable - Enable BPF scheduling for a task * @p: task to enable BPF scheduling for - * @args: enable arguments, see the struct definition * * Enable @p for BPF scheduling. @p is now in the cgroup specified in * @args. enable() is called on @p any time it enters SCX, and is * always paired with a matching disable(). */ - void (*enable)(struct task_struct *p, struct scx_enable_args *args); - - /** - * cancel_enable - Cancel prep_enable() - * @p: task being canceled - * @args: enable arguments, see the struct definition - * - * @p was prep_enable()'d but failed before reaching enable(). Undo the - * preparation. - */ - void (*cancel_enable)(struct task_struct *p, - struct scx_enable_args *args); + void (*enable)(struct task_struct *p); /** * disable - Disable BPF scheduling for a task @@ -722,11 +727,11 @@ struct sched_ext_entity { * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. 
* - * If set from ops.prep_enable() and the task's policy is already + * If set from ops.init_task() and the task's policy is already * %SCHED_EXT, which can happen while the BPF scheduler is being loaded * or by inhering the parent's policy during fork, the task's policy is - * rejected and forcefully reverted to %SCHED_NORMAL. The number of such - * events are reported through /sys/kernel/debug/sched_ext::nr_rejected. + * rejected and forcefully reverted to %SCHED_NORMAL. The number of + * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. */ bool disallow; /* reject switching into SCX */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 55cc5f5c9a85f..cace32cbd8379 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -535,9 +535,9 @@ static bool ops_cpu_valid(s32 cpu) * @err: -errno value to sanitize * * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return - * -%EPROTO. This is necessary because returning a rogue -errno up the chain can - * cause misbehaviors. For an example, a large negative return from - * ops.prep_enable() triggers an oops when passed up the call chain because the + * -%EPROTO. This is necessary because returning a rogue -errno up the chain + * can cause misbehaviors. For an example, a large negative return from + * ops.init_task() triggers an oops when passed up the call chain because the * value fails IS_ERR() test after being encoded with ERR_PTR() and then is * handled as a pointer. */ @@ -2279,11 +2279,11 @@ static struct cgroup *tg_cgrp(struct task_group *tg) return &cgrp_dfl_root.cgrp; } -#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) .cgroup = tg_cgrp(tg), +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), #else /* CONFIG_EXT_GROUP_SCHED */ -#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) +#define SCX_INIT_TASK_ARGS_CGROUP(tg) #endif /* CONFIG_EXT_GROUP_SCHED */ @@ -2326,20 +2326,20 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) } } -static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) +static int scx_ops_init_task(struct task_struct *p, struct task_group *tg) { int ret; p->scx.disallow = false; - if (SCX_HAS_OP(prep_enable)) { - struct scx_enable_args args = { - SCX_ENABLE_ARGS_INIT_CGROUP(tg) + if (SCX_HAS_OP(init_task)) { + struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) }; - ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, prep_enable, p, &args); + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); if (unlikely(ret)) { - ret = ops_sanitize_err("prep_enable", ret); + ret = ops_sanitize_err("init_task", ret); return ret; } } @@ -2356,8 +2356,8 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) * We're either in fork or load path and @p->policy will be * applied right after. Reverting @p->policy here and rejecting * %SCHED_EXT transitions from scx_check_setscheduler() - * guarantees that if ops.prep_enable() sets @p->disallow, @p - * can never be in SCX. + * guarantees that if ops.init_task() sets @p->disallow, @p can + * never be in SCX. */ if (p->policy == SCHED_EXT) { p->policy = SCHED_NORMAL; @@ -2387,13 +2387,8 @@ static void scx_ops_enable_task(struct task_struct *p) * doesn't see a stale value if they inspect the task struct. 
*/ set_task_scx_weight(p); - if (SCX_HAS_OP(enable)) { - struct scx_enable_args args = { - SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) - }; - - SCX_CALL_OP_TASK(SCX_KF_REST, enable, p, &args); - } + if (SCX_HAS_OP(enable)) + SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); scx_set_task_state(p, SCX_TASK_ENABLED); if (SCX_HAS_OP(set_weight)) @@ -2412,18 +2407,16 @@ static void scx_ops_disable_task(struct task_struct *p) static void scx_ops_exit_task(struct task_struct *p) { - lockdep_assert_rq_held(task_rq(p)); + struct scx_exit_task_args args = { + .cancelled = false, + }; + lockdep_assert_rq_held(task_rq(p)); switch (scx_get_task_state(p)) { case SCX_TASK_NONE: return; case SCX_TASK_INIT: - if (SCX_HAS_OP(cancel_enable)) { - struct scx_enable_args args = { - SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) - }; - SCX_CALL_OP(SCX_KF_REST, cancel_enable, p, &args); - } + args.cancelled = true; break; case SCX_TASK_READY: break; @@ -2432,6 +2425,8 @@ static void scx_ops_exit_task(struct task_struct *p) break; } + if (SCX_HAS_OP(exit_task)) + SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -2451,7 +2446,7 @@ int scx_fork(struct task_struct *p) percpu_rwsem_assert_held(&scx_fork_rwsem); if (scx_enabled()) - return scx_ops_prepare_task(p, task_group(p)); + return scx_ops_init_task(p, task_group(p)); else return 0; } @@ -3368,7 +3363,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) /* * Exit early if ops.init() triggered scx_bpf_error(). Not * strictly necessary as we'll fail transitioning into ENABLING - * later but that'd be after calling ops.prep_enable() on all + * later but that'd be after calling ops.init_task() on all * tasks and with -EBUSY which isn't very intuitive. Let's exit * early with success so that the condition is notified through * ops.exit() like other scx_bpf_error() invocations. @@ -3448,13 +3443,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops) get_task_struct(p); spin_unlock_irq(&scx_tasks_lock); - ret = scx_ops_prepare_task(p, task_group(p)); + ret = scx_ops_init_task(p, task_group(p)); if (ret) { put_task_struct(p); spin_lock_irq(&scx_tasks_lock); scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); - pr_err("sched_ext: ops.prep_enable() failed (%d) for %s[%d] while loading\n", + pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", ret, p->comm, p->pid); goto err_disable_unlock; } @@ -3689,7 +3684,7 @@ static int bpf_scx_check_member(const struct btf_type *t, u32 moff = __btf_member_bit_offset(t, member) / 8; switch (moff) { - case offsetof(struct sched_ext_ops, prep_enable): + case offsetof(struct sched_ext_ops, init_task): #ifdef CONFIG_EXT_GROUP_SCHED case offsetof(struct sched_ext_ops, cgroup_init): case offsetof(struct sched_ext_ops, cgroup_exit): @@ -3735,7 +3730,7 @@ static int bpf_scx_update(void *kdata, void *old_kdata) * sched_ext does not support updating the actively-loaded BPF * scheduler, as registering a BPF scheduler can always fail if the * scheduler returns an error code for e.g. ops.init(), - * ops.prep_enable(), etc. Similarly, we can always race with + * ops.init_task(), etc. Similarly, we can always race with * unregistration happening elsewhere, such as with sysrq. */ return -EOPNOTSUPP; @@ -3948,7 +3943,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_init = { * @node: NUMA node to allocate from * * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), - * ops.prep_enable(), ops.cgroup_init() and ops.cgroup_prep_move(). 
+ * ops.init_task(), ops.cgroup_init() and ops.cgroup_prep_move(). */ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) { From aa60d9ebc020a0fbd7233227f1f502c1cc58ec8f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 15 Dec 2023 10:53:46 -0600 Subject: [PATCH 231/304] scx: Add init_enable_count testcase We expect to have some sched_ext_ops callbacks be called differently depending on the scheduler, and the tasks running on the system. Let's add a testcase that verifies that the init_task(), exit_task(), enable(), and disable() callbacks are all invoked correctly and as expected. Signed-off-by: David Vernet --- tools/testing/selftests/scx/.gitignore | 1 + tools/testing/selftests/scx/Makefile | 27 +++-- .../selftests/scx/init_enable_count.bpf.c | 57 +++++++++ .../testing/selftests/scx/init_enable_count.c | 114 ++++++++++++++++++ .../scx/select_cpu_dfl_nodispatch.bpf.c | 6 +- 5 files changed, 189 insertions(+), 16 deletions(-) create mode 100644 tools/testing/selftests/scx/init_enable_count.bpf.c create mode 100644 tools/testing/selftests/scx/init_enable_count.c diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore index ab806b18d9dba..991721c50d9ee 100644 --- a/tools/testing/selftests/scx/.gitignore +++ b/tools/testing/selftests/scx/.gitignore @@ -2,6 +2,7 @@ dsp_fallbackdsq_fail dsp_localdsq_fail enq_last_no_enq_fails enqueue_select_cpu_fails +init_enable_count minimal select_cpu_dfl select_cpu_dfl_nodispatch diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index e993335a22e0c..ae713d614f252 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -96,8 +96,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -O2 -mcpu=v3 # sort removes libbpf duplicates when not cross-building -MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ - $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ +MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(OBJ_DIR)/libbpf \ + $(OBJ_DIR)/bpftool $(OBJ_DIR)/resolve_btfids \ $(INCLUDE_DIR) $(SCXOBJ_DIR)) $(MAKE_DIRS): @@ -112,14 +112,14 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ DESTDIR=$(OUTPUT_DIR) prefix= all install_headers $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ - $(LIBBPF_OUTPUT) | $(HOST_BUILD_DIR)/bpftool + $(LIBBPF_OUTPUT) | $(OBJ_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \ EXTRA_CFLAGS='-g -O0' \ - OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ - LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ - LIBBPF_DESTDIR=$(HOST_OUTPUT_DIR)/ \ - prefix= DESTDIR=$(HOST_OUTPUT_DIR)/ install-bin + OUTPUT=$(OBJ_DIR)/bpftool/ \ + LIBBPF_OUTPUT=$(OBJ_DIR)/libbpf/ \ + LIBBPF_DESTDIR=$(OUTPUT_DIR)/ \ + prefix= DESTDIR=$(OUTPUT_DIR)/ install-bin $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) @@ -148,16 +148,17 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP # C schedulers # ################ c-sched-targets := \ + dsp_fallbackdsq_fail \ + dsp_localdsq_fail \ + enq_last_no_enq_fails \ + enqueue_select_cpu_fails \ + init_enable_count \ minimal \ select_cpu_dfl \ select_cpu_dfl_nodispatch \ select_cpu_dispatch \ - select_cpu_dispatch_dbl_dsp \ select_cpu_dispatch_bad_dsq \ - enqueue_select_cpu_fails \ - enq_last_no_enq_fails \ - dsp_localdsq_fail \ - dsp_fallbackdsq_fail + select_cpu_dispatch_dbl_dsp $(c-sched-targets): %: $(filter-out %.bpf.c,%.c) 
$(INCLUDE_DIR)/%.bpf.skel.h $(eval sched=$(notdir $@)) @@ -167,7 +168,7 @@ $(c-sched-targets): %: $(filter-out %.bpf.c,%.c) $(INCLUDE_DIR)/%.bpf.skel.h TEST_GEN_PROGS := $(c-sched-targets) override define CLEAN - rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) + rm -rf $(OUTPUT_DIR) rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h rm -f $(TEST_GEN_PROGS) endef diff --git a/tools/testing/selftests/scx/init_enable_count.bpf.c b/tools/testing/selftests/scx/init_enable_count.bpf.c new file mode 100644 index 0000000000000..8ad8fdf4ad608 --- /dev/null +++ b/tools/testing/selftests/scx/init_enable_count.bpf.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that we do proper counting of init, enable, etc + * callbacks. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; +volatile const bool switch_all; + +s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + __sync_fetch_and_add(&init_task_cnt, 1); + + return 0; +} + +void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) +{ + __sync_fetch_and_add(&exit_task_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) +{ + __sync_fetch_and_add(&enable_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) +{ + __sync_fetch_and_add(&disable_cnt, 1); +} + +s32 BPF_STRUCT_OPS(cnt_init) +{ + if (switch_all) + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops init_enable_count_ops = { + .init_task = cnt_init_task, + .exit_task = cnt_exit_task, + .enable = cnt_enable, + .disable = cnt_disable, + .init = cnt_init, + .name = "init_enable_count", +}; diff --git a/tools/testing/selftests/scx/init_enable_count.c b/tools/testing/selftests/scx/init_enable_count.c new file mode 100644 index 0000000000000..413bf065646eb --- /dev/null +++ b/tools/testing/selftests/scx/init_enable_count.c @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include "scx_test.h" +#include "init_enable_count.bpf.skel.h" + +#define SCHED_EXT 7 + +static struct init_enable_count * +open_load_prog(bool global) +{ + struct init_enable_count *skel; + + skel = init_enable_count__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + if (global) + skel->rodata->switch_all = global; + + SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); + + return skel; +} + +static void run_test(bool global) +{ + struct init_enable_count *skel; + struct bpf_link *link; + const u32 num_children = 5; + int ret, i, status; + struct sched_param param = {}; + pid_t pids[num_children]; + + skel = open_load_prog(global); + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + /* SCHED_EXT children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + SCX_BUG_ON(pids[i] < 0, "Failed to fork child"); + + if (pids[i] == 0) { + ret = sched_setscheduler(0, SCHED_EXT, ¶m); + SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); + + /* + * Reset to SCHED_OTHER for half of them. 
Counts for + * everything should still be the same regardless, as + * ops.disable() is invoked even if a task is still on + * SCHED_EXT before it exits. + */ + if (i % 2 == 0) { + ret = sched_setscheduler(0, SCHED_OTHER, ¶m); + SCX_BUG_ON(ret, "Failed to reset sched to normal"); + } + exit(0); + } + } + for (i = 0; i < num_children; i++) { + SCX_BUG_ON(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for SCX child"); + SCX_BUG_ON(status != 0, "SCX child %d exited with status %d", + i, status); + } + + /* SCHED_OTHER children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + if (pids[i] == 0) + exit(0); + } + for (i = 0; i < num_children; i++) { + SCX_BUG_ON(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for normal child"); + SCX_BUG_ON(status != 0, + "Normal child %d exited with status %d", i, status); + } + + sleep(1); + + SCX_GE(skel->bss->init_task_cnt, 2 * num_children); + SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); + + if (global) { + SCX_GE(skel->bss->enable_cnt, 2 * num_children); + SCX_GE(skel->bss->disable_cnt, 2 * num_children); + } else { + SCX_EQ(skel->bss->enable_cnt, num_children); + SCX_EQ(skel->bss->disable_cnt, num_children); + } + + bpf_link__destroy(link); + init_enable_count__destroy(skel); +} + +int main(int argc, char **argv) +{ + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + run_test(true); + run_test(false); + + return 0; +} diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c index 9d026e0cbdbb4..636ea1de12fe0 100644 --- a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c @@ -70,8 +70,8 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); } -s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_prep_enable, - struct task_struct *p, struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, + struct task_struct *p, struct scx_init_task_args *args) { if (bpf_task_storage_get(&task_ctx_stor, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -91,7 +91,7 @@ SEC(".struct_ops.link") struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { .select_cpu = select_cpu_dfl_nodispatch_select_cpu, .enqueue = select_cpu_dfl_nodispatch_enqueue, - .prep_enable = select_cpu_dfl_nodispatch_prep_enable, + .init_task = select_cpu_dfl_nodispatch_init_task, .init = select_cpu_dfl_nodispatch_init, .name = "select_cpu_dfl_nodispatch", }; From 6b8ccfd2b2651646b1de94ede151ed436ab826d4 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 4 Jan 2024 22:14:09 -0600 Subject: [PATCH 232/304] scx: Move sched_ext_entity.ddsq_id out of modifiable fields When we added support for dispatching from ops.select_cpu(), I accidentally put the sched_ext_entity.ddsq_id field into the "modifiable fields" part of struct sched_ext_entity. It should be harmless, but there shouldn't be any reason for a scheduler to muck with it, so let's move it up. 
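For illustration only (not part of the patch), the layout convention being restored can be shown with a toy userspace struct: kernel-owned bookkeeping such as ddsq_id sits above the "BPF scheduler modifiable fields" marker, while scheduler-writable fields like slice and dsq_vtime sit below it. The struct below is a simplified stand-in, not the real struct sched_ext_entity.

#include <stdint.h>
#include <stdio.h>

struct toy_ext_entity {
	/* kernel-owned bookkeeping; the BPF scheduler has no reason to touch it */
	uint64_t ddsq_id;		/* DSQ picked via direct dispatch from select_cpu */
	unsigned long runnable_at;

	/* BPF scheduler modifiable fields */
	uint64_t slice;
	uint64_t dsq_vtime;
};

int main(void)
{
	struct toy_ext_entity e = { .slice = 20000000 /* 20ms in ns, example value */ };

	/* a scheduler updates its own fields... */
	e.dsq_vtime += e.slice;
	/* ...and leaves the kernel-managed ddsq_id alone */
	printf("dsq_vtime=%llu ddsq_id=%llu\n",
	       (unsigned long long)e.dsq_vtime,
	       (unsigned long long)e.ddsq_id);
	return 0;
}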
Signed-off-by: David Vernet --- include/linux/sched/ext.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 73bcb9292954f..e5e69587be6fe 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -693,6 +693,7 @@ struct sched_ext_entity { #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ #endif + u64 ddsq_id; /* BPF scheduler modifiable fields */ @@ -717,12 +718,6 @@ struct sched_ext_entity { */ u64 dsq_vtime; - /* - * Used to track when a task has requested a direct dispatch from the - * ops.select_cpu() path. - */ - u64 ddsq_id; - /* * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. From 367eab28387c75fe6873fb094ef90a2dee497e4a Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Jan 2024 10:37:13 -0600 Subject: [PATCH 233/304] scx: Add missing DSQ fallback test files I forgot to include these in the patch set that fixes and tests us gracefully falling back to the global DSQ. Signed-off-by: David Vernet --- .../selftests/scx/dsp_localdsq_fail.bpf.c | 39 +++++++++++++++++++ .../testing/selftests/scx/dsp_localdsq_fail.c | 36 +++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tools/testing/selftests/scx/dsp_localdsq_fail.bpf.c create mode 100644 tools/testing/selftests/scx/dsp_localdsq_fail.c diff --git a/tools/testing/selftests/scx/dsp_localdsq_fail.bpf.c b/tools/testing/selftests/scx/dsp_localdsq_fail.bpf.c new file mode 100644 index 0000000000000..e27a95a8726be --- /dev/null +++ b/tools/testing/selftests/scx/dsp_localdsq_fail.bpf.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(dsp_localdsq_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ + scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +s32 BPF_STRUCT_OPS(dsp_localdsq_fail_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops dsp_localdsq_fail_ops = { + .select_cpu = dsp_localdsq_fail_select_cpu, + .init = dsp_localdsq_fail_init, + .name = "dsp_localdsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/dsp_localdsq_fail.c b/tools/testing/selftests/scx/dsp_localdsq_fail.c new file mode 100644 index 0000000000000..4840386ba7643 --- /dev/null +++ b/tools/testing/selftests/scx/dsp_localdsq_fail.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include +#include "dsp_localdsq_fail.bpf.skel.h" +#include "scx_test.h" + +int main(int argc, char **argv) +{ + struct dsp_localdsq_fail *skel; + struct bpf_link *link; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = dsp_localdsq_fail__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.dsp_localdsq_fail_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + sleep(1); + + bpf_link__destroy(link); + dsp_localdsq_fail__destroy(skel); + + return 0; +} From 2cf297c2d74ca804635a36f9d63ae78874373bdb Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Jan 2024 13:33:48 -0600 Subject: [PATCH 234/304] scx: Claim idle core in scx_select_cpu_dfl for nr_cpus_allowed ==1 In scx_select_cpu_dfl(), we're currently returning prev_cpu if p->nr_cpus_allowed == 1. It makes sense to return prev_cpu if the task can't run on any other cores, but we might as well also try to claim the core as idle so that: 1. scx_select_cpu_dfl() will directly dispatch it 2. To prevent another core from incorrectly assuming that core will be idle when in reality that task will be enqueued to it. The mask will eventually be updated in __scx_update_idle(), but this seems more efficient. 3. To have the idle cpumask bit be unset when the task is enqueued in ops.enqueue() (if the core scheduler is using scx_bpf_select_cpu_dfl()). Signed-off-by: David Vernet --- kernel/sched/ext.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cace32cbd8379..fe25df8022cb0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2048,8 +2048,14 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, goto cpu_found; } - if (p->nr_cpus_allowed == 1) - return prev_cpu; + if (p->nr_cpus_allowed == 1) { + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } else { + return prev_cpu; + } + } /* * If CPU has SMT, any wholly idle CPU is likely a better pick than From e6cb89231255d9b86a088a8f5ec5c0df3dbe73e7 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Jan 2024 17:03:59 -0600 Subject: [PATCH 235/304] scx: Make select_cpu_dfl test a bit less brittle select_cpu_dfl checks whether a task that's successfully dispatched from the default select_cpu implementation isn't subsequently enqueued. It's only doing the check for non-pcpu threads, but that's not really the condition we want to look for. We don't want to do the check for any task that's being enqueued on the enable path, because it won't have gone through the select_cpu path. Instead, let's just check the task name to verify it's the test task. 
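As a quick illustration (not part of the patch), the intent of the name check is a plain comm-prefix match; the BPF version in the diff below expresses the same thing with bpf_strncmp() on p->comm. Userspace sketch:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* true if the comm looks like one of the select_cpu_dfl test tasks */
static bool task_is_test(const char *comm)
{
	return strncmp(comm, "select_cpu", 9) == 0;
}

int main(void)
{
	printf("%d\n", task_is_test("select_cpu_dfl"));	/* 1: test task */
	printf("%d\n", task_is_test("kworker/0:1"));		/* 0: unrelated */
	return 0;
}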
Signed-off-by: David Vernet --- tools/testing/selftests/scx/select_cpu_dfl.bpf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/scx/select_cpu_dfl.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c index 091bf1ed9bec0..f2fa80628299b 100644 --- a/tools/testing/selftests/scx/select_cpu_dfl.bpf.c +++ b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c @@ -14,14 +14,20 @@ char _license[] SEC("license") = "GPL"; bool saw_local = false; +static bool task_is_test(const struct task_struct *p) +{ + return !bpf_strncmp(p->comm, 9, "select_cpu"); +} + void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, u64 enq_flags) { const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); - if (p->nr_cpus_allowed > 1 && - bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) + if (task_is_test(p) && + bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { saw_local = true; + } scx_bpf_put_idle_cpumask(idle_mask); scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); From c3c7041b04c7fe1e80f7a558fed494f82f204618 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 8 Jan 2024 09:54:08 -0600 Subject: [PATCH 236/304] scx: Avoid possible deadlock with cpus_read_lock() Han Xing Yi reported a syzbot lockdep error over the weekend: ====================================================== WARNING: possible circular locking dependency detected 6.6.0-g2f6ba98e2d3d #4 Not tainted ------------------------------------------------------ syz-executor.0/2181 is trying to acquire lock: ffffffff84772410 (pernet_ops_rwsem){++++}-{3:3}, at: copy_net_ns+0x216/0x590 net/core/net_namespace.c:487 but task is already holding lock: ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: sched_fork+0x3b/0x190 kernel/sched/core.c:4810 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (scx_fork_rwsem){++++}-{0:0}: percpu_down_write+0x51/0x210 kernel/locking/percpu-rwsem.c:227 scx_ops_enable+0x230/0xf90 kernel/sched/ext.c:3271 bpf_struct_ops_link_create+0x1b9/0x220 kernel/bpf/bpf_struct_ops.c:914 link_create kernel/bpf/syscall.c:4938 [inline] __sys_bpf+0x35af/0x4ac0 kernel/bpf/syscall.c:5453 __do_sys_bpf kernel/bpf/syscall.c:5487 [inline] __se_sys_bpf kernel/bpf/syscall.c:5485 [inline] __x64_sys_bpf+0x48/0x60 kernel/bpf/syscall.c:5485 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x6e/0x76 -> #2 (cpu_hotplug_lock){++++}-{0:0}: percpu_down_read include/linux/percpu-rwsem.h:51 [inline] cpus_read_lock+0x42/0x1b0 kernel/cpu.c:489 flush_all_backlogs net/core/dev.c:5885 [inline] unregister_netdevice_many_notify+0x30a/0x1070 net/core/dev.c:10965 unregister_netdevice_many+0x19/0x20 net/core/dev.c:11039 sit_exit_batch_net+0x433/0x460 net/ipv6/sit.c:1887 ops_exit_list+0xc5/0xe0 net/core/net_namespace.c:175 cleanup_net+0x3e2/0x750 net/core/net_namespace.c:614 process_one_work+0x50d/0xc20 kernel/workqueue.c:2630 process_scheduled_works kernel/workqueue.c:2703 [inline] worker_thread+0x50b/0x950 kernel/workqueue.c:2784 kthread+0x1fa/0x250 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242 -> #1 (rtnl_mutex){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:603 [inline] __mutex_lock+0xc1/0xea0 kernel/locking/mutex.c:747 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:799 rtnl_lock+0x17/0x20 net/core/rtnetlink.c:79 register_netdevice_notifier+0x25/0x1c0 net/core/dev.c:1741 rtnetlink_init+0x3a/0x6e0 net/core/rtnetlink.c:6657 netlink_proto_init+0x23d/0x2f0 net/netlink/af_netlink.c:2946 do_one_initcall+0xb3/0x5f0 init/main.c:1232 do_initcall_level init/main.c:1294 [inline] do_initcalls init/main.c:1310 [inline] do_basic_setup init/main.c:1329 [inline] kernel_init_freeable+0x40c/0x5d0 init/main.c:1547 kernel_init+0x1d/0x350 init/main.c:1437 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242 -> #0 (pernet_ops_rwsem){++++}-{3:3}: check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain kernel/locking/lockdep.c:3868 [inline] __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136 lock_acquire kernel/locking/lockdep.c:5753 [inline] lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718 down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549 copy_net_ns+0x216/0x590 net/core/net_namespace.c:487 create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110 copy_namespaces+0x488/0x540 kernel/nsproxy.c:179 copy_process+0x1b52/0x4680 kernel/fork.c:2504 kernel_clone+0x116/0x660 kernel/fork.c:2914 __do_sys_clone3+0x192/0x220 kernel/fork.c:3215 __se_sys_clone3 kernel/fork.c:3199 [inline] __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x6e/0x76 other info that might help us debug this: Chain exists of: pernet_ops_rwsem --> cpu_hotplug_lock --> scx_fork_rwsem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- rlock(scx_fork_rwsem); lock(cpu_hotplug_lock); lock(scx_fork_rwsem); rlock(pernet_ops_rwsem); *** DEADLOCK *** 1 lock held by syz-executor.0/2181: #0: ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: 
sched_fork+0x3b/0x190 kernel/sched/core.c:4810 stack backtrace: CPU: 0 PID: 2181 Comm: syz-executor.0 Not tainted 6.6.0-g2f6ba98e2d3d #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Sched_ext: serialise (enabled), task: runnable_at=-6ms Call Trace: __dump_stack lib/dump_stack.c:89 [inline] dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:107 dump_stack+0x15/0x20 lib/dump_stack.c:114 check_noncircular+0x134/0x150 kernel/locking/lockdep.c:2187 check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain kernel/locking/lockdep.c:3868 [inline] __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136 lock_acquire kernel/locking/lockdep.c:5753 [inline] lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718 down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549 copy_net_ns+0x216/0x590 net/core/net_namespace.c:487 create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110 copy_namespaces+0x488/0x540 kernel/nsproxy.c:179 copy_process+0x1b52/0x4680 kernel/fork.c:2504 kernel_clone+0x116/0x660 kernel/fork.c:2914 __do_sys_clone3+0x192/0x220 kernel/fork.c:3215 __se_sys_clone3 kernel/fork.c:3199 [inline] __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x6e/0x76 RIP: 0033:0x7f9f764e240d Code: c3 e8 97 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f9f75851ee8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b3 RAX: ffffffffffffffda RBX: 00007f9f7661ef80 RCX: 00007f9f764e240d RDX: 0000000000000100 RSI: 0000000000000058 RDI: 00007f9f75851f00 RBP: 00007f9f765434a6 R08: 0000000000000000 R09: 0000000000000058 R10: 00007f9f75851f00 R11: 0000000000000246 R12: 0000000000000058 R13: 0000000000000006 R14: 00007f9f7661ef80 R15: 00007f9f75832000 The issue is that we're acquiring the cpus_read_lock() _before_ we acquire scx_fork_rwsem in scx_ops_enable() and scx_ops_disable(), but we acquire and hold scx_fork_rwsem around basically the whole fork() path. I don't see how a deadlock could actually occur in practice, but it should be safe to acquire the scx_fork_rwsem and scx_cgroup_rwsem semaphores before the hotplug lock, so let's do that. 
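The rule lockdep is enforcing can be illustrated outside the kernel: if any path takes lock A and then lock B, no other path may take B and then A. A minimal pthread analogue of the ordering chosen here, with the rwlocks standing in for scx_fork_rwsem and cpu_hotplug_lock (names and structure are illustrative only, not the kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t fork_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

/* enable/disable path: always fork_rwsem first, hotplug lock second */
static void ops_enable_path(void)
{
	pthread_rwlock_wrlock(&fork_rwsem);
	pthread_rwlock_rdlock(&hotplug_lock);
	puts("enable: locks taken in the agreed order");
	pthread_rwlock_unlock(&hotplug_lock);
	pthread_rwlock_unlock(&fork_rwsem);
}

/* fork path: only ever takes fork_rwsem, never the reverse order */
static void fork_path(void)
{
	pthread_rwlock_rdlock(&fork_rwsem);
	puts("fork: fork_rwsem held");
	pthread_rwlock_unlock(&fork_rwsem);
}

int main(void)
{
	ops_enable_path();
	fork_path();
	return 0;
}

As long as every path follows the same A-then-B order, the circular dependency in the report above cannot form.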
Reported-by: Han Xing Yi Signed-off-by: David Vernet --- kernel/sched/ext.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fe25df8022cb0..e9a01e4ba1832 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3155,9 +3155,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* avoid racing against fork and cgroup changes */ - cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); scx_cgroup_lock(); + cpus_read_lock(); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); @@ -3196,9 +3196,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_cgroup_exit(); + cpus_read_unlock(); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - cpus_read_unlock(); if (ei->kind >= SCX_EXIT_ERROR) { printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); @@ -3353,9 +3353,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops) atomic_long_set(&scx_nr_rejected, 0); /* - * Keep CPUs stable during enable so that the BPF scheduler can track - * online CPUs by watching ->on/offline_cpu() after ->init(). + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + * + * Also keep CPUs stable during enable so that the BPF scheduler can + * track online CPUs by watching ->on/offline_cpu() after ->init(). + * + * Acquire scx_fork_rwsem and scx_group_rwsem before the hotplug lock. + * cpus_read_lock() is acquired in a ton of places, so let's be a bit + * cautious to avoid possible deadlock. */ + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); cpus_read_lock(); scx_switch_all_req = false; @@ -3399,13 +3408,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops) queue_delayed_work(system_unbound_wq, &scx_watchdog_work, scx_watchdog_timeout / 2); - /* - * Lock out forks, cgroup on/offlining and moves before opening the - * floodgate so that they don't wander into the operations prematurely. 
- */ - percpu_down_write(&scx_fork_rwsem); - scx_cgroup_lock(); - for (i = 0; i < SCX_NR_ONLINE_OPS; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -3431,7 +3433,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) */ ret = scx_cgroup_init(); if (ret) - goto err_disable_unlock; + goto err_disable; static_branch_enable_cpuslocked(&__scx_ops_enabled); @@ -3457,7 +3459,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", ret, p->comm, p->pid); - goto err_disable_unlock; + goto err_disable; } put_task_struct(p); @@ -3481,7 +3483,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) preempt_enable(); spin_unlock_irq(&scx_tasks_lock); ret = -EBUSY; - goto err_disable_unlock; + goto err_disable; } /* @@ -3515,8 +3517,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); preempt_enable(); - scx_cgroup_unlock(); - percpu_up_write(&scx_fork_rwsem); if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { ret = -EBUSY; @@ -3527,6 +3527,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) static_branch_enable_cpuslocked(&__scx_switched_all); cpus_read_unlock(); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); mutex_unlock(&scx_ops_enable_mutex); scx_cgroup_config_knobs(); @@ -3537,11 +3539,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops) mutex_unlock(&scx_ops_enable_mutex); return ret; -err_disable_unlock: - scx_cgroup_unlock(); - percpu_up_write(&scx_fork_rwsem); err_disable: cpus_read_unlock(); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); mutex_unlock(&scx_ops_enable_mutex); /* must be fully disabled before returning */ scx_ops_disable(SCX_EXIT_ERROR); From 8bbe0dbd8a779efc28a08418ae79a44319a5e024 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 8 Jan 2024 11:14:41 -0600 Subject: [PATCH 237/304] scx: Set default slice for default select_cpu dispatch If ops.select_cpu() isn't defined, scx_select_cpu_dfl() will be called, and a task will be dispatched directly to a core if one is found. I neglected to also set the task slice, so we see the following warning if we use the direct dispatch: [root@arch scx]# ./select_cpu_dfl [ 23.184426] sched_ext: select_cpu_dfl[356] has zero slice in pick_next_task_scx() I'm not sure why this wasn't being printed when I tested this before, but let's fix it. Signed-off-by: David Vernet --- kernel/sched/ext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fe25df8022cb0..a3caebb42f354 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2124,8 +2124,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag s32 cpu; cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); - if (found) + if (found) { + p->scx.slice = SCX_SLICE_DFL; p->scx.ddsq_id = SCX_DSQ_LOCAL; + } return cpu; } } From 4164e16dbce462786e6568fd3fa29090052423ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 31 Dec 2023 06:51:41 +0900 Subject: [PATCH 238/304] scx: Use READ/WRITE_ONCE() for scx_watchdog_timeout/timestamp They're accessed without any locking and check_rq_for_timeouts() seems to assume that last_runnable doesn't get fetched multipled times which isn't true without READ_ONCE(). 
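The fix follows the usual pattern for lockless shared variables: fetch each one exactly once into a local, then compute on the locals. A hedged userspace analogue using C11 atomics (the kernel uses READ_ONCE()/WRITE_ONCE() on plain variables rather than _Atomic, and the names below are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static _Atomic unsigned long watchdog_timestamp;
static _Atomic unsigned long watchdog_timeout = 30;	/* seconds, example value */

/* writer side, analogous to WRITE_ONCE(scx_watchdog_timestamp, jiffies) */
static void watchdog_tick(void)
{
	atomic_store_explicit(&watchdog_timestamp, (unsigned long)time(NULL),
			      memory_order_relaxed);
}

/* reader side: each shared value is loaded once, then only locals are used */
static int timed_out(void)
{
	unsigned long last = atomic_load_explicit(&watchdog_timestamp,
						  memory_order_relaxed);
	unsigned long timeout = atomic_load_explicit(&watchdog_timeout,
						     memory_order_relaxed);

	return (unsigned long)time(NULL) > last + timeout;
}

int main(void)
{
	watchdog_tick();
	printf("timed out: %d\n", timed_out());
	return 0;
}

Without the single load, the compiler is free to re-read the shared variable between uses, which is exactly the hazard the commit describes.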
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 +++++++---- kernel/sched/ext.h | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 631e5dfb7b701..096a025da955c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2244,7 +2244,7 @@ static void scx_watchdog_workfn(struct work_struct *work) { int cpu; - scx_watchdog_timestamp = jiffies; + WRITE_ONCE(scx_watchdog_timestamp, jiffies); for_each_online_cpu(cpu) { if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) @@ -3321,6 +3321,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) { struct scx_task_iter sti; struct task_struct *p; + unsigned long timeout; int i, ret; mutex_lock(&scx_ops_enable_mutex); @@ -3402,11 +3403,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err_disable; } - scx_watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; if (ops->timeout_ms) - scx_watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; - scx_watchdog_timestamp = jiffies; + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); queue_delayed_work(system_unbound_wq, &scx_watchdog_work, scx_watchdog_timeout / 2); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index a8f72efe39b36..3055efbfaf526 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -163,8 +163,9 @@ static inline void scx_notify_sched_tick(void) if (!scx_enabled()) return; - last_check = scx_watchdog_timestamp; - if (unlikely(time_after(jiffies, last_check + scx_watchdog_timeout))) { + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_check); scx_ops_error_kind(SCX_EXIT_ERROR_STALL, From 9c0a7992d1fd2986c4919979e1a3064787c31490 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:22 -1000 Subject: [PATCH 239/304] scx: Rename rq->scx.watchdog_list and friends to runnable_list and counterparts The list will be used for another purpose too. Rename to indicate the generic nature. 
Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 6 ++++-- init/init_task.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/ext.c | 41 ++++++++++++++++++++------------------- kernel/sched/sched.h | 2 +- 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e5e69587be6fe..90383453d65d1 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -616,7 +616,7 @@ enum scx_ent_flags { SCX_TASK_STATE_0 = 1 << 3, /* first bit encoding the task's current state */ SCX_TASK_STATE_1 = 1 << 4, /* second bit encoding the task's current state */ - SCX_TASK_WATCHDOG_RESET = 1 << 16, /* task watchdog counter should be reset */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 16, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ @@ -680,7 +680,6 @@ struct sched_ext_entity { struct list_head fifo; /* dispatch order */ struct rb_node priq; /* p->scx.dsq_vtime order */ } dsq_node; - struct list_head watchdog_node; u32 flags; /* protected by rq lock */ u32 dsq_flags; /* protected by dsq lock */ u32 weight; @@ -689,7 +688,10 @@ struct sched_ext_entity { u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; + #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ #endif diff --git a/init/init_task.c b/init/init_task.c index c19041286b76c..fbada7fbd4701 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -107,7 +107,7 @@ struct task_struct init_task #ifdef CONFIG_SCHED_CLASS_EXT .scx = { .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), - .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node), + .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), .flags = 0, .sticky_cpu = -1, .holding_cpu = -1, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c8885037f2a30..258d9320ef480 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4555,7 +4555,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->scx.dsq = NULL; INIT_LIST_HEAD(&p->scx.dsq_node.fifo); RB_CLEAR_NODE(&p->scx.dsq_node.priq); - INIT_LIST_HEAD(&p->scx.watchdog_node); + INIT_LIST_HEAD(&p->scx.runnable_node); p->scx.flags = 0; p->scx.weight = 0; p->scx.sticky_cpu = -1; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 096a025da955c..21906020db510 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -946,25 +946,25 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, dispatch_enqueue(&scx_dsq_global, p, enq_flags); } -static bool watchdog_task_watched(const struct task_struct *p) +static bool task_runnable(const struct task_struct *p) { - return !list_empty(&p->scx.watchdog_node); + return !list_empty(&p->scx.runnable_node); } -static void watchdog_watch_task(struct rq *rq, struct task_struct *p) +static void set_task_runnable(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); - if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) p->scx.runnable_at = jiffies; - p->scx.flags &= ~SCX_TASK_WATCHDOG_RESET; - list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list); + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } -static void 
watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) +static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) { - list_del_init(&p->scx.watchdog_node); - if (reset_timeout) - p->scx.flags |= SCX_TASK_WATCHDOG_RESET; + list_del_init(&p->scx.runnable_node); + if (reset_runnable_at) + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; } static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) @@ -986,11 +986,11 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags sticky_cpu = cpu_of(rq); if (p->scx.flags & SCX_TASK_QUEUED) { - WARN_ON_ONCE(!watchdog_task_watched(p)); + WARN_ON_ONCE(!task_runnable(p)); return; } - watchdog_watch_task(rq, p); + set_task_runnable(rq, p); p->scx.flags |= SCX_TASK_QUEUED; rq->scx.nr_running++; add_nr_running(rq, 1); @@ -1008,7 +1008,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) { unsigned long opss; - watchdog_unwatch_task(p, false); + /* dequeue is always temporary, don't reset runnable_at */ + clr_task_runnable(p, false); /* acquire ensures that we see the preceding updates on QUEUED */ opss = atomic_long_read_acquire(&p->scx.ops_state); @@ -1055,7 +1056,7 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags struct scx_rq *scx_rq = &rq->scx; if (!(p->scx.flags & SCX_TASK_QUEUED)) { - WARN_ON_ONCE(watchdog_task_watched(p)); + WARN_ON_ONCE(task_runnable(p)); return; } @@ -1710,7 +1711,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) SCX_CALL_OP_TASK(SCX_KF_REST, running, p); - watchdog_unwatch_task(p, true); + clr_task_runnable(p, true); /* * @p is getting newly scheduled or got kicked after someone updated its @@ -1772,13 +1773,13 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) */ if (p->scx.flags & SCX_TASK_BAL_KEEP) { p->scx.flags &= ~SCX_TASK_BAL_KEEP; - watchdog_watch_task(rq, p); + set_task_runnable(rq, p); dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); return; } if (p->scx.flags & SCX_TASK_QUEUED) { - watchdog_watch_task(rq, p); + set_task_runnable(rq, p); /* * If @p has slice left and balance_scx() didn't tag it for @@ -2220,7 +2221,7 @@ static bool check_rq_for_timeouts(struct rq *rq) bool timed_out = false; rq_lock_irqsave(rq, &rf); - list_for_each_entry(p, &rq->scx.watchdog_list, scx.watchdog_node) { + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { unsigned long last_runnable = p->scx.runnable_at; if (unlikely(time_after(jiffies, @@ -2375,7 +2376,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg) task_rq_unlock(rq, p, &rf); } - p->scx.flags |= SCX_TASK_WATCHDOG_RESET; + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; return 0; } @@ -3902,7 +3903,7 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); - INIT_LIST_HEAD(&rq->scx.watchdog_list); + INIT_LIST_HEAD(&rq->scx.runnable_list); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 32eddb62a96dc..ec254f5e320b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -690,7 +690,7 @@ enum scx_rq_flags { struct scx_rq { struct scx_dispatch_q local_dsq; - struct list_head watchdog_list; + struct list_head runnable_list; /* runnable tasks on this rq */ unsigned long ops_qseq; u64 extra_enq_flags; 
/* see move_task_to_local_dsq() */ u32 nr_running; From 215f0ff5cee341c405fd1765a0f2da230560def5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:23 -1000 Subject: [PATCH 240/304] scx: Factor out scx_ops_bypass() and s/scx_ops_disabling()/scx_ops_bypassing()/g Guaranteeing forward progress by forcing global FIFO behavior is currently used only in the disabling path. This will be used for something else too. Let's factor it out and rename accordingly. No functional change intended. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 119 +++++++++++++++++++++++---------------------- 1 file changed, 62 insertions(+), 57 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 21906020db510..a309d192e70d6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -495,7 +495,7 @@ static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); } -static bool scx_ops_disabling(void) +static bool scx_ops_bypassing(void) { return unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING); } @@ -1594,7 +1594,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, * same conditions later and pick @rq->curr accordingly. */ if ((prev->scx.flags & SCX_TASK_QUEUED) && - prev->scx.slice && !scx_ops_disabling()) { + prev->scx.slice && !scx_ops_bypassing()) { if (local) prev->scx.flags |= SCX_TASK_BAL_KEEP; return 1; @@ -1787,7 +1787,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) * scheduler class or core-sched forcing a different task. Leave * it at the head of the local DSQ. */ - if (p->scx.slice && !scx_ops_disabling()) { + if (p->scx.slice && !scx_ops_bypassing()) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); return; } @@ -1830,7 +1830,7 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return NULL; if (unlikely(!p->scx.slice)) { - if (!scx_ops_disabling() && !scx_warned_zero_slice) { + if (!scx_ops_bypassing() && !scx_warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", p->comm, p->pid); scx_warned_zero_slice = true; @@ -1869,7 +1869,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, * calling ops.core_sched_before(). Accesses are controlled by the * verifier. */ - if (SCX_HAS_OP(core_sched_before) && !scx_ops_disabling()) + if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, (struct task_struct *)a, (struct task_struct *)b); @@ -2265,7 +2265,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) * While disabling, always resched and refresh core-sched timestamp as * we can't trust the slice management or ops.core_sched_before(). 
*/ - if (scx_ops_disabling()) { + if (scx_ops_bypassing()) { curr->scx.slice = 0; touch_core_sched(rq, curr); } @@ -2568,7 +2568,7 @@ bool scx_can_stop_tick(struct rq *rq) { struct task_struct *p = rq->curr; - if (scx_ops_disabling()) + if (scx_ops_bypassing()) return false; if (p->sched_class != &ext_sched_class) @@ -3017,7 +3017,7 @@ static void scx_cgroup_config_knobs(void) {} */ bool task_should_scx(struct task_struct *p) { - if (!scx_enabled() || scx_ops_disabling()) + if (!scx_enabled() || scx_ops_bypassing()) return false; if (READ_ONCE(scx_switching_all)) return true; @@ -3034,6 +3034,57 @@ static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} +/** + * scx_ops_bypass - Bypass scx_ops and guarantee forward progress + * + * We must guarantee that all runnable tasks make forward progress without + * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might + * be held by tasks that the BPF scheduler is forgetting to run, which + * unfortunately also excludes toggling the static branches. + * + * Let's work around by overriding a couple ops and modifying behaviors based on + * the DISABLING state and then cycling the tasks through dequeue/enqueue to + * force global FIFO scheduling. + * + * a. ops.enqueue() and .dispatch() are overridden for simple global FIFO + * scheduling. + * + * b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be + * trusted. Whenever a tick triggers, the running task is rotated to the tail + * of the queue with core_sched_at touched. + * + * c. pick_next_task() suppresses zero slice warning. + * + * d. scx_prio_less() reverts to the default core_sched_at order. + */ +static void scx_ops_bypass(void) +{ + struct scx_task_iter sti; + struct task_struct *p; + int cpu; + + scx_ops.enqueue = scx_ops_fallback_enqueue; + scx_ops.dispatch = scx_ops_fallback_dispatch; + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + if (READ_ONCE(p->__state) != TASK_DEAD) { + struct sched_enq_and_set_ctx ctx; + + /* cycling deq/enq is enough, see above */ + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_enq_and_set_task(&ctx); + } + } + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + + /* kick all CPUs to restore ticks */ + for_each_possible_cpu(cpu) + resched_cpu(cpu); +} + static void scx_ops_disable_workfn(struct kthread_work *work) { struct scx_exit_info *ei = &scx_exit_info; @@ -3042,7 +3093,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; const char *reason; - int i, cpu, kind; + int i, kind; kind = atomic_read(&scx_exit_kind); while (true) { @@ -3090,63 +3141,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work) SCX_OPS_DISABLING); return; case SCX_OPS_PREPPING: - goto forward_progress_guaranteed; + break; case SCX_OPS_DISABLING: /* shouldn't happen but handle it like ENABLING if it does */ WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); fallthrough; case SCX_OPS_ENABLING: case SCX_OPS_ENABLED: + scx_ops_bypass(); break; } - /* - * DISABLING is set and ops was either ENABLING or ENABLED indicating - * that the ops and static branches are set. - * - * We must guarantee that all runnable tasks make forward progress - * without trusting the BPF scheduler. 
We can't grab any mutexes or - * rwsems as they might be held by tasks that the BPF scheduler is - * forgetting to run, which unfortunately also excludes toggling the - * static branches. - * - * Let's work around by overriding a couple ops and modifying behaviors - * based on the DISABLING state and then cycling the tasks through - * dequeue/enqueue to force global FIFO scheduling. - * - * a. ops.enqueue() and .dispatch() are overridden for simple global - * FIFO scheduling. - * - * b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value - * can't be trusted. Whenever a tick triggers, the running task is - * rotated to the tail of the queue with core_sched_at touched. - * - * c. pick_next_task() suppresses zero slice warning. - * - * d. scx_prio_less() reverts to the default core_sched_at order. - */ - scx_ops.enqueue = scx_ops_fallback_enqueue; - scx_ops.dispatch = scx_ops_fallback_dispatch; - - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); - while ((p = scx_task_iter_next_filtered_locked(&sti))) { - if (READ_ONCE(p->__state) != TASK_DEAD) { - struct sched_enq_and_set_ctx ctx; - - /* cycling deq/enq is enough, see above */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); - } - } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); - - /* kick all CPUs to restore ticks */ - for_each_possible_cpu(cpu) - resched_cpu(cpu); - -forward_progress_guaranteed: /* * Here, every runnable task is guaranteed to make forward progress and * we can safely use blocking synchronization constructs. Actually From f4c4ef2203bf6953a4787e447a64990113c7b183 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:23 -1000 Subject: [PATCH 241/304] scx: Implement bypass depth and always bypass while disabling Implement bypass depth so that multiple users can request bypassing without conflicts. This decouples bypass on/off from ops state so that bypassing can be used in combination with any ops state. The unbypassing path isn't used yet and is to be implemented. Note that task_should_scx() needs to test whether DISABLING rather than bypassing and thus updated to test scx_ops_enable_state() explicitly. The disable path now always uses bypassing to balance bypass depth. This also leads to simpler code. 
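The depth-counting idea stands on its own: each bypass request bumps an atomic counter, each release drops it, and only the 0->1 and 1->0 transitions actually toggle the bypass behavior. A minimal userspace sketch under those assumptions (illustrative names, not the kernel implementation):

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int bypass_depth;

static void set_bypass(bool bypass)
{
	int depth;

	if (bypass) {
		depth = atomic_fetch_add(&bypass_depth, 1) + 1;
		assert(depth > 0);
		if (depth != 1)
			return;	/* someone else already switched bypass on */
		puts("bypass ON: force global FIFO behavior");
	} else {
		depth = atomic_fetch_sub(&bypass_depth, 1) - 1;
		assert(depth >= 0);
		if (depth != 0)
			return;	/* other users still require bypass */
		puts("bypass OFF: back to the BPF scheduler");
	}
}

int main(void)
{
	set_bypass(true);	/* ON */
	set_bypass(true);	/* nested request, no state change */
	set_bypass(false);	/* still bypassing */
	set_bypass(false);	/* OFF */
	return 0;
}

Because on/off is keyed to the transitions rather than to a boolean, the disable path and any later user can both request bypass without stepping on each other.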
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 50 +++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a309d192e70d6..1256778a660b9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -91,6 +91,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); static bool scx_switch_all_req; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -497,7 +498,7 @@ static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, static bool scx_ops_bypassing(void) { - return unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING); + return unlikely(atomic_read(&scx_ops_bypass_depth)); } /** @@ -3017,7 +3018,8 @@ static void scx_cgroup_config_knobs(void) {} */ bool task_should_scx(struct task_struct *p) { - if (!scx_enabled() || scx_ops_bypassing()) + if (!scx_enabled() || + unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; @@ -3035,9 +3037,9 @@ static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} /** - * scx_ops_bypass - Bypass scx_ops and guarantee forward progress + * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress * - * We must guarantee that all runnable tasks make forward progress without + * Bypassing guarantees that all runnable tasks make forward progress without * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might * be held by tasks that the BPF scheduler is forgetting to run, which * unfortunately also excludes toggling the static branches. @@ -3057,14 +3059,26 @@ static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} * * d. scx_prio_less() reverts to the default core_sched_at order. 
*/ -static void scx_ops_bypass(void) +static void scx_ops_bypass(bool bypass) { struct scx_task_iter sti; struct task_struct *p; - int cpu; + int depth, cpu; + + if (bypass) { + depth = atomic_inc_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth <= 0); + if (depth != 1) + return; - scx_ops.enqueue = scx_ops_fallback_enqueue; - scx_ops.dispatch = scx_ops_fallback_dispatch; + scx_ops.enqueue = scx_ops_fallback_enqueue; + scx_ops.dispatch = scx_ops_fallback_dispatch; + } else { + depth = atomic_dec_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth < 0); + if (depth != 0) + return; + } spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); @@ -3133,22 +3147,20 @@ static void scx_ops_disable_workfn(struct kthread_work *work) ei->kind = kind; strlcpy(ei->reason, reason, sizeof(ei->reason)); + /* guarantee forward progress by bypassing scx_ops */ + scx_ops_bypass(true); + switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { + case SCX_OPS_DISABLING: + WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); + break; case SCX_OPS_DISABLED: pr_warn("sched_ext: ops error detected without ops (%s)\n", scx_exit_info.msg); WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != SCX_OPS_DISABLING); - return; - case SCX_OPS_PREPPING: - break; - case SCX_OPS_DISABLING: - /* shouldn't happen but handle it like ENABLING if it does */ - WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); - fallthrough; - case SCX_OPS_ENABLING: - case SCX_OPS_ENABLED: - scx_ops_bypass(); + goto done; + default: break; } @@ -3245,6 +3257,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) SCX_OPS_DISABLING); scx_cgroup_config_knobs(); +done: + scx_ops_bypass(false); } static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); From a00ac85c441c708c6072ebb75d1f58179fdd70c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:23 -1000 Subject: [PATCH 242/304] scx: Implement turning off bypassing Bypassing overrides ops.enqueue() and .dispatch() to force global FIFO behavior. However, this was an irreversible action making it impossible to turn off bypassing. Instead, add behaviors conditional on scx_ops_bypassing() to implement global FIFO behavior while bypassing. This adds two condition checks to hot paths but they're easily predictable and shouldn't add noticeable overhead. 
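In other words, the hot paths now ask "are we bypassing?" instead of having their callbacks overwritten, which is what makes the state reversible. A hedged userspace sketch of that shape (illustrative only; in the diff below the real checks are added to do_enqueue_task() and balance_one()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int bypass_depth;

static bool bypassing(void)
{
	return atomic_load(&bypass_depth) != 0;
}

static void enqueue(int task_id)
{
	if (bypassing()) {
		/* ignore the BPF scheduler; queue in global FIFO order */
		printf("task %d -> global FIFO\n", task_id);
		return;
	}
	/* normal path: hand the task to the scheduler's enqueue callback */
	printf("task %d -> ops.enqueue()\n", task_id);
}

int main(void)
{
	enqueue(1);				/* normal */
	atomic_fetch_add(&bypass_depth, 1);
	enqueue(2);				/* bypassed */
	atomic_fetch_sub(&bypass_depth, 1);
	enqueue(3);				/* normal again */
	return 0;
}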
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1256778a660b9..c7e00baec5194 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -889,6 +889,13 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (unlikely(!test_rq_online(rq))) goto local; + if (scx_ops_bypassing()) { + if (enq_flags & SCX_ENQ_LAST) + goto local; + else + goto global; + } + /* see %SCX_OPS_ENQ_EXITING */ if (!static_branch_unlikely(&scx_ops_enq_exiting) && unlikely(p->flags & PF_EXITING)) @@ -1609,7 +1616,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, if (consume_dispatch_q(rq, rf, &scx_dsq_global)) return 1; - if (!SCX_HAS_OP(dispatch)) + if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing()) return 0; dspc->rq = rq; @@ -3026,16 +3033,6 @@ bool task_should_scx(struct task_struct *p) return p->policy == SCHED_EXT; } -static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) -{ - if (enq_flags & SCX_ENQ_LAST) - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); - else - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -} - -static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} - /** * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress * @@ -3048,16 +3045,17 @@ static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} * the DISABLING state and then cycling the tasks through dequeue/enqueue to * force global FIFO scheduling. * - * a. ops.enqueue() and .dispatch() are overridden for simple global FIFO - * scheduling. + * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. * - * b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be + * b. ops.dispatch() is ignored. + * + * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be * trusted. Whenever a tick triggers, the running task is rotated to the tail * of the queue with core_sched_at touched. * - * c. pick_next_task() suppresses zero slice warning. + * d. pick_next_task() suppresses zero slice warning. * - * d. scx_prio_less() reverts to the default core_sched_at order. + * e. scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { @@ -3070,9 +3068,6 @@ static void scx_ops_bypass(bool bypass) WARN_ON_ONCE(depth <= 0); if (depth != 1) return; - - scx_ops.enqueue = scx_ops_fallback_enqueue; - scx_ops.dispatch = scx_ops_fallback_dispatch; } else { depth = atomic_dec_return(&scx_ops_bypass_depth); WARN_ON_ONCE(depth < 0); @@ -3086,7 +3081,7 @@ static void scx_ops_bypass(bool bypass) if (READ_ONCE(p->__state) != TASK_DEAD) { struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see above */ + /* cycling deq/enq is enough, see the function comment */ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); sched_enq_and_set_task(&ctx); } From 8583a03659689015dba50abb3566f06a7f7be6fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:23 -1000 Subject: [PATCH 243/304] scx: Optimize scx_ops_bypass() scx_ops_bypass() involves scanning all tasks in the system and can thus become pretty expensive which limits its utility. scx_ops_bypass() isn't making any persistent changes to tasks. It just wants to dequeue and re-enqueue runnable tasks so that they're queued according to the current bypass state. 
As such, it can iterate the runnable tasks rather than all. This patch makes scx_ops_bypass() iterate each CPU's rq->scx.runnable_list. There are subtle complications due to the inability to trust the scheduler and each task going off and getting back on the runnable_list as they get cycled. See the comments for details. After this optimization, [un]bypassing should be pretty cheap in most circumstances. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 55 +++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c7e00baec5194..97d67becd9a8a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -962,9 +962,16 @@ static bool task_runnable(const struct task_struct *p) static void set_task_runnable(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); - if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) + + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { p->scx.runnable_at = jiffies; - p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + } + + /* + * list_add_tail() must be used. scx_ops_bypass() depends on tasks being + * appened to the runnable_list. + */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } @@ -3042,8 +3049,8 @@ bool task_should_scx(struct task_struct *p) * unfortunately also excludes toggling the static branches. * * Let's work around by overriding a couple ops and modifying behaviors based on - * the DISABLING state and then cycling the tasks through dequeue/enqueue to - * force global FIFO scheduling. + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue + * to force global FIFO scheduling. * * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. * @@ -3059,8 +3066,6 @@ bool task_should_scx(struct task_struct *p) */ static void scx_ops_bypass(bool bypass) { - struct scx_task_iter sti; - struct task_struct *p; int depth, cpu; if (bypass) { @@ -3075,23 +3080,43 @@ static void scx_ops_bypass(bool bypass) return; } - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); - while ((p = scx_task_iter_next_filtered_locked(&sti))) { - if (READ_ONCE(p->__state) != TASK_DEAD) { + /* + * No task property is changing. We just need to make sure all currently + * queued tasks are re-queued according to the new scx_ops_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * + * This function can't trust the scheduler and thus can't use + * cpus_read_lock(). Walk all possible CPUs instead of online. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p, *n; + + rq_lock_irqsave(rq, &rf); + + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back + * to the runnable_list during iteration. Because they're added + * to the tail of the list, safe reverse iteration can still + * visit all nodes. 
+ */ + list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, + scx.runnable_node) { struct sched_enq_and_set_ctx ctx; /* cycling deq/enq is enough, see the function comment */ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); sched_enq_and_set_task(&ctx); } - } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); - /* kick all CPUs to restore ticks */ - for_each_possible_cpu(cpu) + rq_unlock_irqrestore(rq, &rf); + + /* kick to restore ticks */ resched_cpu(cpu); + } } static void scx_ops_disable_workfn(struct kthread_work *work) From 303c346c108ad559853ed0d9503bca9e1c59da62 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:23 -1000 Subject: [PATCH 244/304] scx: Expose bypassing state to userland Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 97d67becd9a8a..b797e7ea0dcf0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3606,6 +3606,7 @@ static int scx_debug_show(struct seq_file *m, void *v) seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all()); seq_printf(m, "%-30s: %s\n", "enable_state", scx_ops_enable_state_str[scx_ops_enable_state()]); + seq_printf(m, "%-30s: %d\n", "bypassing", scx_ops_bypassing()); seq_printf(m, "%-30s: %lu\n", "nr_rejected", atomic_long_read(&scx_nr_rejected)); mutex_unlock(&scx_ops_enable_mutex); From a37ef8e6444e53d346e98b48aaea2eab53fa11df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:23 -1000 Subject: [PATCH 245/304] scx: s/register_ext_kfuncs()/scx_init()/ We need more stuff to do in the init function. Give it a more generic name. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index b797e7ea0dcf0..577a38b1155e9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4638,15 +4638,14 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { __diag_pop(); -/* - * This can't be done from init_sched_ext_class() as register_btf_kfunc_id_set() - * needs most of the system to be up. - */ -static int __init register_ext_kfuncs(void) +static int __init scx_init(void) { int ret; /* + * kfunc registration can't be done from init_sched_ext_class() as + * register_btf_kfunc_id_set() needs most of the system to be up. + * * Some kfuncs are context-sensitive and can only be called from * specific SCX ops. They are grouped into BTF sets accordingly. * Unfortunately, BPF currently doesn't have a way of enforcing such @@ -4676,4 +4675,4 @@ static int __init register_ext_kfuncs(void) return 0; } -__initcall(register_ext_kfuncs); +__initcall(scx_init); From df28190349770642a3dfead1467313a8430c38f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:23 -1000 Subject: [PATCH 246/304] scx: Bypass while PM operations are in progress SCX schedulers often have userspace components which are sometimes involved in critial scheduling paths. PM operations involve freezing userspace which can lead to scheduling misbehaviors including stalls. Let's bypass while PM operations are in progress. 
Signed-off-by: Tejun Heo Reported-by: Andrea Righi --- kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 005025f55beaa..96ea08f76603a 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 577a38b1155e9..238e2b53907af 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3914,6 +3914,37 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) runnable_at_buf); } +static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) +{ + if (!scx_enabled()) + return NOTIFY_OK; + + /* + * SCX schedulers often have userspace components which are sometimes + * involved in critial scheduling paths. PM operations involve freezing + * userspace which can lead to scheduling misbehaviors including stalls. + * Let's bypass while PM operations are in progress. + */ + switch (event) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + case PM_RESTORE_PREPARE: + scx_ops_bypass(true); + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + scx_ops_bypass(false); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_pm_notifier = { + .notifier_call = scx_pm_handler, +}; + void __init init_sched_ext_class(void) { int cpu; @@ -4669,10 +4700,14 @@ static int __init scx_init(void) &scx_kfunc_set_any)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_any))) { - pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret); + pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); return ret; } + ret = register_pm_notifier(&scx_pm_notifier); + if (ret) + pr_warn("sched_ext: Failed to register PM notifier (%d)\n", ret); + return 0; } __initcall(scx_init); From a62d59c6ba63803562e3d5adb7bbdb846e97e53d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 10:21:24 -1000 Subject: [PATCH 247/304] scx: Disabling scx_bpf_kick_cpu() while bypassing scx_bpf_kick_cpu() uses irq_work. However, if called while e.g. suspending, IRQ handling may already be offline and scheduling irq_work can hang indefinitely. There's no need for kicking while bypassing anyway, let's suppress scx_bpf_kick_cpu() while bypassing. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 238e2b53907af..98dcaafb2834b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3063,6 +3063,9 @@ bool task_should_scx(struct task_struct *p) * d. pick_next_task() suppresses zero slice warning. * * e. scx_prio_less() reverts to the default core_sched_at order. + * + * f. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. */ static void scx_ops_bypass(bool bypass) { @@ -4319,6 +4322,14 @@ void scx_bpf_kick_cpu(s32 cpu, u64 flags) return; } + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. 
+ */ + if (scx_ops_bypassing()) + return; + preempt_disable(); rq = this_rq(); From 8588d4ff430e38aa9686fa691c7c8e17177422a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 11:36:17 -1000 Subject: [PATCH 248/304] Revert "scx: Avoid possible deadlock with cpus_read_lock()" This reverts commit c3c7041b04c7fe1e80f7a558fed494f82f204618. We hit a locking ordering issue in the other direction. Let's revert for now. [ 9.378773] ====================================================== [ 9.379476] WARNING: possible circular locking dependency detected [ 9.379532] 6.6.0-work-10442-ga7150a9168f8-dirty #134 Not tainted [ 9.379532] ------------------------------------------------------ [ 9.379532] scx_rustland/1622 is trying to acquire lock: [ 9.379532] ffffffff8325f828 (cpu_hotplug_lock){++++}-{0:0}, at: bpf_scx_reg+0xe4/0xcf0 [ 9.379532] [ 9.379532] but task is already holding lock: [ 9.379532] ffffffff83271be8 (scx_cgroup_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xdf/0xcf0 [ 9.379532] [ 9.379532] which lock already depends on the new lock. [ 9.379532] [ 9.379532] [ 9.379532] the existing dependency chain (in reverse order) is: [ 9.379532] [ 9.379532] -> #2 (scx_cgroup_rwsem){++++}-{0:0}: [ 9.379532] percpu_down_read+0x2e/0xb0 [ 9.379532] scx_cgroup_can_attach+0x25/0x200 [ 9.379532] cpu_cgroup_can_attach+0xe/0x10 [ 9.379532] cgroup_migrate_execute+0xaf/0x450 [ 9.379532] cgroup_apply_control+0x227/0x2a0 [ 9.379532] cgroup_subtree_control_write+0x425/0x4b0 [ 9.379532] cgroup_file_write+0x82/0x260 [ 9.379532] kernfs_fop_write_iter+0x131/0x1c0 [ 9.379532] vfs_write+0x1f9/0x270 [ 9.379532] ksys_write+0x62/0xc0 [ 9.379532] __x64_sys_write+0x1b/0x20 [ 9.379532] do_syscall_64+0x40/0xe0 [ 9.379532] entry_SYSCALL_64_after_hwframe+0x46/0x4e [ 9.379532] [ 9.379532] -> #1 (cgroup_threadgroup_rwsem){++++}-{0:0}: [ 9.379532] percpu_down_write+0x35/0x1e0 [ 9.379532] cgroup_procs_write_start+0x8a/0x210 [ 9.379532] __cgroup_procs_write+0x4c/0x160 [ 9.379532] cgroup_procs_write+0x17/0x30 [ 9.379532] cgroup_file_write+0x82/0x260 [ 9.379532] kernfs_fop_write_iter+0x131/0x1c0 [ 9.379532] vfs_write+0x1f9/0x270 [ 9.379532] ksys_write+0x62/0xc0 [ 9.379532] __x64_sys_write+0x1b/0x20 [ 9.379532] do_syscall_64+0x40/0xe0 [ 9.379532] entry_SYSCALL_64_after_hwframe+0x46/0x4e [ 9.379532] [ 9.379532] -> #0 (cpu_hotplug_lock){++++}-{0:0}: [ 9.379532] __lock_acquire+0x142d/0x2a30 [ 9.379532] lock_acquire+0xbf/0x1f0 [ 9.379532] cpus_read_lock+0x2f/0xc0 [ 9.379532] bpf_scx_reg+0xe4/0xcf0 [ 9.379532] bpf_struct_ops_link_create+0xb6/0x100 [ 9.379532] link_create+0x49/0x200 [ 9.379532] __sys_bpf+0x351/0x3e0 [ 9.379532] __x64_sys_bpf+0x1c/0x20 [ 9.379532] do_syscall_64+0x40/0xe0 [ 9.379532] entry_SYSCALL_64_after_hwframe+0x46/0x4e [ 9.379532] [ 9.379532] other info that might help us debug this: [ 9.379532] [ 9.379532] Chain exists of: [ 9.379532] cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem [ 9.379532] [ 9.379532] Possible unsafe locking scenario: [ 9.379532] [ 9.379532] CPU0 CPU1 [ 9.379532] ---- ---- [ 9.379532] lock(scx_cgroup_rwsem); [ 9.379532] lock(cgroup_threadgroup_rwsem); [ 9.379532] lock(scx_cgroup_rwsem); [ 9.379532] rlock(cpu_hotplug_lock); [ 9.379532] [ 9.379532] *** DEADLOCK *** [ 9.379532] [ 9.379532] 3 locks held by scx_rustland/1622: [ 9.379532] #0: ffffffff83272708 (scx_ops_enable_mutex){+.+.}-{3:3}, at: bpf_scx_reg+0x2a/0xcf0 [ 9.379532] #1: ffffffff83271aa0 (scx_fork_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xd3/0xcf0 [ 9.379532] #2: ffffffff83271be8 (scx_cgroup_rwsem){++++}-{0:0}, at: 
bpf_scx_reg+0xdf/0xcf0 [ 9.379532] [ 9.379532] stack backtrace: [ 9.379532] CPU: 7 PID: 1622 Comm: scx_rustland Not tainted 6.6.0-work-10442-ga7150a9168f8-dirty #134 [ 9.379532] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022 [ 9.379532] Sched_ext: rustland (prepping) [ 9.379532] Call Trace: [ 9.379532] [ 9.379532] dump_stack_lvl+0x55/0x70 [ 9.379532] dump_stack+0x10/0x20 [ 9.379532] print_circular_bug+0x2ea/0x2f0 [ 9.379532] check_noncircular+0xe2/0x100 [ 9.379532] __lock_acquire+0x142d/0x2a30 [ 9.379532] ? lock_acquire+0xbf/0x1f0 [ 9.379532] ? rcu_sync_func+0x2c/0xa0 [ 9.379532] lock_acquire+0xbf/0x1f0 [ 9.379532] ? bpf_scx_reg+0xe4/0xcf0 [ 9.379532] cpus_read_lock+0x2f/0xc0 [ 9.379532] ? bpf_scx_reg+0xe4/0xcf0 [ 9.379532] bpf_scx_reg+0xe4/0xcf0 [ 9.379532] ? alloc_file+0xa4/0x160 [ 9.379532] ? alloc_file_pseudo+0x99/0xd0 [ 9.379532] ? anon_inode_getfile+0x79/0xc0 [ 9.379532] ? bpf_link_prime+0xe2/0x1a0 [ 9.379532] bpf_struct_ops_link_create+0xb6/0x100 [ 9.379532] link_create+0x49/0x200 [ 9.379532] __sys_bpf+0x351/0x3e0 [ 9.379532] __x64_sys_bpf+0x1c/0x20 [ 9.379532] do_syscall_64+0x40/0xe0 [ 9.379532] ? sysvec_apic_timer_interrupt+0x44/0x80 [ 9.379532] entry_SYSCALL_64_after_hwframe+0x46/0x4e [ 9.379532] RIP: 0033:0x7fc391f7473d [ 9.379532] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 95 0c 00 f7 d8 64 89 01 48 [ 9.379532] RSP: 002b:00007ffeb4fe4108 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 [ 9.379532] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc391f7473d [ 9.379532] RDX: 0000000000000030 RSI: 00007ffeb4fe4120 RDI: 000000000000001c [ 9.379532] RBP: 000000000000000c R08: 000000000000000c R09: 000055d0a75b1a10 [ 9.379532] R10: 0000000000000050 R11: 0000000000000246 R12: 000000000000002c [ 9.379532] R13: 00007ffeb4fe4628 R14: 0000000000000000 R15: 00007ffeb4fe4328 [ 9.379532] --- kernel/sched/ext.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 98dcaafb2834b..53024ebd18094 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3198,9 +3198,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* avoid racing against fork and cgroup changes */ + cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); scx_cgroup_lock(); - cpus_read_lock(); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); @@ -3239,9 +3239,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_cgroup_exit(); - cpus_read_unlock(); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + cpus_read_unlock(); if (ei->kind >= SCX_EXIT_ERROR) { printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); @@ -3399,18 +3399,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops) atomic_long_set(&scx_nr_rejected, 0); /* - * Lock out forks, cgroup on/offlining and moves before opening the - * floodgate so that they don't wander into the operations prematurely. - * - * Also keep CPUs stable during enable so that the BPF scheduler can - * track online CPUs by watching ->on/offline_cpu() after ->init(). - * - * Acquire scx_fork_rwsem and scx_group_rwsem before the hotplug lock. - * cpus_read_lock() is acquired in a ton of places, so let's be a bit - * cautious to avoid possible deadlock. 
+ * Keep CPUs stable during enable so that the BPF scheduler can track + * online CPUs by watching ->on/offline_cpu() after ->init(). */ - percpu_down_write(&scx_fork_rwsem); - scx_cgroup_lock(); cpus_read_lock(); scx_switch_all_req = false; @@ -3456,6 +3447,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops) queue_delayed_work(system_unbound_wq, &scx_watchdog_work, scx_watchdog_timeout / 2); + /* + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + */ + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + for (i = 0; i < SCX_NR_ONLINE_OPS; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -3481,7 +3479,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) */ ret = scx_cgroup_init(); if (ret) - goto err_disable; + goto err_disable_unlock; static_branch_enable_cpuslocked(&__scx_ops_enabled); @@ -3507,7 +3505,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", ret, p->comm, p->pid); - goto err_disable; + goto err_disable_unlock; } put_task_struct(p); @@ -3531,7 +3529,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) preempt_enable(); spin_unlock_irq(&scx_tasks_lock); ret = -EBUSY; - goto err_disable; + goto err_disable_unlock; } /* @@ -3565,6 +3563,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); preempt_enable(); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { ret = -EBUSY; @@ -3575,8 +3575,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops) static_branch_enable_cpuslocked(&__scx_switched_all); cpus_read_unlock(); - scx_cgroup_unlock(); - percpu_up_write(&scx_fork_rwsem); mutex_unlock(&scx_ops_enable_mutex); scx_cgroup_config_knobs(); @@ -3587,10 +3585,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops) mutex_unlock(&scx_ops_enable_mutex); return ret; -err_disable: - cpus_read_unlock(); +err_disable_unlock: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); +err_disable: + cpus_read_unlock(); mutex_unlock(&scx_ops_enable_mutex); /* must be fully disabled before returning */ scx_ops_disable(SCX_EXIT_ERROR); From 22c362701469ea8c222444455286c6e201a8c268 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 11:51:31 -1000 Subject: [PATCH 249/304] scx: Make scx_task_state handling more idiomatic Functionally equivalent. Just a bit more idiomatic. 
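In the new scheme the task state lives in two dedicated bits of scx.flags and is read and written with a shift/mask pair. The following is a minimal userspace sketch of that pattern, offered only as an illustration of the encoding; every name in it is hypothetical and none of it is kernel code:

#include <stdio.h>

enum demo_task_state { DEMO_NONE, DEMO_INIT, DEMO_READY, DEMO_ENABLED, DEMO_NR_STATES };

#define DEMO_STATE_SHIFT	8	/* bits 8 and 9 carry the state */
#define DEMO_STATE_BITS		2
#define DEMO_STATE_MASK		(((1u << DEMO_STATE_BITS) - 1) << DEMO_STATE_SHIFT)

static enum demo_task_state get_state(unsigned int flags)
{
	return (flags & DEMO_STATE_MASK) >> DEMO_STATE_SHIFT;
}

static unsigned int set_state(unsigned int flags, enum demo_task_state state)
{
	flags &= ~DEMO_STATE_MASK;	/* clear the old state bits */
	return flags | (state << DEMO_STATE_SHIFT);
}

int main(void)
{
	unsigned int flags = 0x3;	/* unrelated flag bits are preserved */

	flags = set_state(flags, DEMO_READY);
	printf("state=%d flags=0x%x\n", get_state(flags), flags); /* state=2 flags=0x203 */
	return 0;
}
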
Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 34 ++++++++++++++-------------------- kernel/sched/ext.c | 34 +++++++++++++++------------------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 90383453d65d1..5f457194e3a8c 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -613,15 +613,24 @@ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ SCX_TASK_DDSP_PRIQ = 1 << 2, /* task should be enqueued on priq when directly dispatched */ - SCX_TASK_STATE_0 = 1 << 3, /* first bit encoding the task's current state */ - SCX_TASK_STATE_1 = 1 << 4, /* second bit encoding the task's current state */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 3, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 4, /* last dequeue was for SLEEP */ - SCX_TASK_RESET_RUNNABLE_AT = 1 << 16, /* runnable_at should be reset */ - SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */ + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ - SCX_TASK_STATE_MASK = SCX_TASK_STATE_0 | SCX_TASK_STATE_1, + SCX_TASK_NR_STATES, }; /* scx_entity.dsq_flags */ @@ -655,21 +664,6 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; -/* scx_entity.task_state */ -enum scx_task_state { - /* ops.prep_enable() has not yet been called on task */ - SCX_TASK_NONE, - - /* ops.prep_enable() succeeded on task, but it still be cancelled */ - SCX_TASK_INIT, - - /* Task is fully initialized, but not being scheduled in sched_ext */ - SCX_TASK_READY, - - /* Task is fully initialized and is being scheduled in sched_ext */ - SCX_TASK_ENABLED, -}; - /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. 
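The ext.c hunk that follows also adds a compile-time guard so the reserved bits can never silently overflow if more states are added later. As a reminder of the idiom, a tiny sketch with hypothetical values (not the kernel's own definitions):

#include <linux/build_bug.h>

#define DEMO_STATE_BITS		2
#define DEMO_NR_STATES		4	/* bump to 5 and the build breaks */

static inline void demo_state_fits(void)
{
	/* Compile-time assertion: all states must fit in the reserved bits. */
	BUILD_BUG_ON(DEMO_NR_STATES > (1 << DEMO_STATE_BITS));
}
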
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 53024ebd18094..2a552efeec6f3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2313,41 +2313,34 @@ static struct cgroup *tg_cgrp(struct task_group *tg) static enum scx_task_state scx_get_task_state(const struct task_struct *p) { - int state = p->scx.flags & SCX_TASK_STATE_MASK; - - switch (state) { - case SCX_TASK_STATE_0 | SCX_TASK_STATE_1: - return SCX_TASK_ENABLED; - case SCX_TASK_STATE_1: - return SCX_TASK_READY; - case SCX_TASK_STATE_0: - return SCX_TASK_INIT; - default: - return SCX_TASK_NONE; - } + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; } static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) { enum scx_task_state prev_state = scx_get_task_state(p); - p->scx.flags &= ~SCX_TASK_STATE_MASK; + BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); + switch (state) { case SCX_TASK_NONE: - return; + break; case SCX_TASK_INIT: WARN_ON_ONCE(prev_state != SCX_TASK_NONE); - p->scx.flags |= SCX_TASK_STATE_0; - return; + break; case SCX_TASK_READY: WARN_ON_ONCE(prev_state == SCX_TASK_NONE); - p->scx.flags |= SCX_TASK_STATE_1; - return; + break; case SCX_TASK_ENABLED: WARN_ON_ONCE(prev_state != SCX_TASK_READY); - p->scx.flags |= (SCX_TASK_STATE_0 | SCX_TASK_STATE_1); + break; + default: + WARN_ON_ONCE(true); return; } + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state << SCX_TASK_STATE_SHIFT; } static int scx_ops_init_task(struct task_struct *p, struct task_group *tg) @@ -2447,6 +2440,9 @@ static void scx_ops_exit_task(struct task_struct *p) case SCX_TASK_ENABLED: scx_ops_disable_task(p); break; + default: + WARN_ON_ONCE(true); + return; } if (SCX_HAS_OP(exit_task)) From 88e756003865b0be12062395b14384952e962883 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 19:05:19 -1000 Subject: [PATCH 250/304] scx: Sync schedulers from SCX v0.1.5 (74923c6cdbc3) --- tools/sched_ext/include/scx/common.bpf.h | 3 +- tools/sched_ext/scx_central.bpf.c | 8 ++ tools/sched_ext/scx_central.c | 7 +- tools/sched_ext/scx_flatcg.bpf.c | 41 ++++-- tools/sched_ext/scx_flatcg.c | 4 +- tools/sched_ext/scx_layered/Cargo.toml | 8 +- tools/sched_ext/scx_layered/README.md | 37 ++++++ .../sched_ext/scx_layered/src/bpf/main.bpf.c | 19 +-- tools/sched_ext/scx_layered/src/main.rs | 20 +-- tools/sched_ext/scx_pair.c | 9 +- tools/sched_ext/scx_qmap.bpf.c | 10 +- tools/sched_ext/scx_qmap.c | 3 +- tools/sched_ext/scx_rusty/Cargo.toml | 8 +- tools/sched_ext/scx_rusty/README.md | 36 ++++++ tools/sched_ext/scx_rusty/src/bpf/main.bpf.c | 29 ++--- tools/sched_ext/scx_rusty/src/main.rs | 67 +++++----- tools/sched_ext/scx_simple.bpf.c | 25 ++-- tools/sched_ext/scx_userland.bpf.c | 119 +++++++++++++++--- tools/sched_ext/scx_userland.c | 81 ++++++++++-- tools/sched_ext/scx_userland.h | 2 - 20 files changed, 392 insertions(+), 144 deletions(-) create mode 100644 tools/sched_ext/scx_layered/README.md create mode 100644 tools/sched_ext/scx_rusty/README.md diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 5c503c2358368..f2336d357106e 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -10,7 +10,7 @@ #include "vmlinux.h" #include #include -#include +#include #include "user_exit_info.h" #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ @@ -68,6 +68,7 @@ const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; 
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 4f398249fb2cc..51ddb0a14bc61 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -161,6 +161,14 @@ static bool dispatch_to_cpu(s32 cpu) __sync_fetch_and_add(&nr_mismatches, 1); scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ, without dispatching to the local DSQ of the + * target CPU. In such a case, break the loop now as will fail the + * next dispatch operation. + */ + if (!scx_bpf_dispatch_nr_slots()) + break; continue; } diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index a3d22409e9ce5..501505001bf98 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -103,17 +104,17 @@ int main(int argc, char **argv) while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); - printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", skel->bss->nr_total, skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n", + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_retries); - printf("overflow:%10lu\n", + printf("overflow:%10" PRIu64 "\n", skel->bss->nr_overflows); fflush(stdout); sleep(1); diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 84a60d7e4024b..869115805b288 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -123,7 +123,7 @@ struct { } task_ctx SEC(".maps"); /* gets inc'd on weight tree changes to expire the cached hweights */ -unsigned long hweight_gen = 1; +u64 hweight_gen = 1; static u64 div_round_up(u64 dividend, u64 divisor) { @@ -302,16 +302,18 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) bpf_spin_unlock(&cgv_tree_lock); } -void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { struct fcg_task_ctx *taskc; - struct cgroup *cgrp; - struct fcg_cgrp_ctx *cgc; + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); if (!taskc) { scx_bpf_error("task_ctx lookup failed"); - return; + return cpu; } /* @@ -321,7 +323,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) * affinities so that we don't have to worry about per-cgroup dq's * containing tasks that can't be executed from some CPUs. 
*/ - if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) { + if (is_idle || p->nr_cpus_allowed != nr_cpus) { /* * Tell fcg_stopping() that this bypassed the regular scheduling * path and should be force charged to the cgroup. 0 is used to @@ -338,14 +340,28 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) * implement per-cgroup fallback dq's instead so that we have * more control over when tasks with custom cpumask get issued. */ - if ((enq_flags & SCX_ENQ_LOCAL) || + if (is_idle || (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) { stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } else { stat_inc(FCG_STAT_GLOBAL); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); } + } + + return cpu; +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); return; } @@ -756,8 +772,8 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) } } -s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct fcg_task_ctx *taskc; struct fcg_cgrp_ctx *cgc; @@ -893,13 +909,14 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops flatcg_ops = { + .select_cpu = (void *)fcg_select_cpu, .enqueue = (void *)fcg_enqueue, .dispatch = (void *)fcg_dispatch, .runnable = (void *)fcg_runnable, .running = (void *)fcg_running, .stopping = (void *)fcg_stopping, .quiescent = (void *)fcg_quiescent, - .prep_enable = (void *)fcg_prep_enable, + .init_task = (void *)fcg_init_task, .cgroup_set_weight = (void *)fcg_cgroup_set_weight, .cgroup_init = (void *)fcg_cgroup_init, .cgroup_exit = (void *)fcg_cgroup_exit, diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6a6e47c83ede7..b326b2d3ec350 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -183,7 +184,7 @@ int main(int argc, char **argv) memcpy(last_stats, acc_stats, sizeof(acc_stats)); - printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n", + printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", seq++, cpu_util * 100.0, skel->data->hweight_gen); printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n", stats[FCG_STAT_ACT], @@ -210,6 +211,7 @@ int main(int argc, char **argv) stats[FCG_STAT_PNC_GONE]); printf("BAD remove:%6llu\n", acc_stats[FCG_STAT_BAD_REMOVAL]); + fflush(stdout); nanosleep(&intv_ts, NULL); } diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml index 19dd0243a9f2a..37a811e3807e2 100644 --- a/tools/sched_ext/scx_layered/Cargo.toml +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_layered" -version = "0.0.1" +version = "0.0.4" authors = ["Tejun Heo ", "Meta"] edition = "2021" description = "Userspace scheduling with BPF for Ads" @@ -13,16 +13,16 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7" lazy_static = "1.4" -libbpf-rs = "0.21" 
+libbpf-rs = "0.22" libc = "0.2" log = "0.4" -scx_utils = "0.3" +scx_utils = "0.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -scx_utils = "0.3" +scx_utils = "0.5" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_layered/README.md b/tools/sched_ext/scx_layered/README.md new file mode 100644 index 0000000000000..37c554b2354db --- /dev/null +++ b/tools/sched_ext/scx_layered/README.md @@ -0,0 +1,37 @@ +# scx_layered + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +A highly configurable multi-layer BPF / user space hybrid scheduler. + +scx_layered allows the user to classify tasks into multiple layers, and apply +different scheduling policies to those layers. For example, a layer could be +created of all tasks that are part of the `user.slice` cgroup slice, and a +policy could be specified that ensures that the layer is given at least 80% CPU +utilization for some subset of CPUs on the system. + +## How To Install + +Available as a [Rust crate](https://crates.io/crates/scx_layered): `cargo add scx_layered` + +## Typical Use Case + +scx_layered is designed to be highly customizable, and can be targeted for +specific applications. For example, if you had a high-priority service that +required priority access to all but 1 physical core to ensure acceptable p99 +latencies, you could specify that the service would get priority access to all +but 1 core on the system. If that service ends up not utilizing all of those +cores, they could be used by other layers until they're needed. + +## Production Ready? + +Yes. If tuned correctly, scx_layered should be performant across various CPU +architectures and workloads. + +That said, you may run into an issue with infeasible weights, where a task with +a very high weight may cause the scheduler to incorrectly leave cores idle +because it thinks they're necessary to accommodate the compute for a single +task. This can also happen in CFS, and should soon be addressed for +scx_layered. 
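To make the infeasible-weights caveat concrete with made-up numbers: on a 4-CPU machine running one task of weight 10000 alongside three tasks of weight 100 each, a purely weight-proportional allocator would give the heavy task 10000/10300, roughly 97% of total capacity, i.e. nearly four CPUs' worth, even though a single task can never consume more than one CPU (25% of capacity). Reserving capacity the task cannot use is what leaves cores idle; the remedy is to cap each task's share at one CPU's worth and redistribute the excess weight among the remaining tasks.
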
diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c index 98d9418e1adf1..21dd0e4cd8395 100644 --- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c @@ -745,8 +745,8 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } -s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct task_ctx tctx_init = { .pid = p->pid, @@ -805,14 +805,8 @@ s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p) -{ - s32 pid = p->pid; - - bpf_map_delete_elem(&task_ctxs, &pid); -} - -void BPF_STRUCT_OPS(layered_disable, struct task_struct *p) +void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { struct cpu_ctx *cctx; struct task_ctx *tctx; @@ -977,9 +971,8 @@ struct sched_ext_ops layered = { .quiescent = (void *)layered_quiescent, .set_weight = (void *)layered_set_weight, .set_cpumask = (void *)layered_set_cpumask, - .prep_enable = (void *)layered_prep_enable, - .cancel_enable = (void *)layered_cancel_enable, - .disable = (void *)layered_disable, + .init_task = (void *)layered_init_task, + .exit_task = (void *)layered_exit_task, .init = (void *)layered_init, .exit = (void *)layered_exit, .name = "layered", diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 8f4d77db04ea9..5b5374226f49a 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -1122,10 +1122,10 @@ struct Scheduler<'a> { impl<'a> Scheduler<'a> { fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec) -> Result<()> { - skel.rodata().nr_layers = specs.len() as u32; + skel.rodata_mut().nr_layers = specs.len() as u32; for (spec_i, spec) in specs.iter().enumerate() { - let layer = &mut skel.bss().layers[spec_i]; + let layer = &mut skel.bss_mut().layers[spec_i]; for (or_i, or) in spec.matches.iter().enumerate() { for (and_i, and) in or.iter().enumerate() { @@ -1176,12 +1176,12 @@ impl<'a> Scheduler<'a> { let mut skel = skel_builder.open().context("Failed to open BPF program")?; // Initialize skel according to @opts. 
- skel.rodata().debug = opts.verbose as u32; - skel.rodata().slice_ns = opts.slice_us * 1000; - skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; - skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; + skel.rodata_mut().debug = opts.verbose as u32; + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; + skel.rodata_mut().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; for cpu in cpu_pool.all_cpus.iter_ones() { - skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8); + skel.rodata_mut().all_cpus[cpu / 8] |= 1 << (cpu % 8); } Self::init_layers(&mut skel, &layer_specs)?; @@ -1274,7 +1274,7 @@ impl<'a> Scheduler<'a> { { Self::update_bpf_layer_cpumask( &self.layers[idx], - &mut self.skel.bss().layers[idx], + &mut self.skel.bss_mut().layers[idx], ); updated = true; } @@ -1288,7 +1288,7 @@ impl<'a> Scheduler<'a> { let nr_available_cpus = available_cpus.count_ones(); for idx in 0..self.layers.len() { let layer = &mut self.layers[idx]; - let bpf_layer = &mut self.skel.bss().layers[idx]; + let bpf_layer = &mut self.skel.bss_mut().layers[idx]; match &layer.kind { LayerKind::Open { .. } => { layer.cpus.copy_from_bitslice(&available_cpus); @@ -1299,7 +1299,7 @@ impl<'a> Scheduler<'a> { } } - self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32; + self.skel.bss_mut().fallback_cpu = self.cpu_pool.fallback_cpu as u32; for (lidx, layer) in self.layers.iter().enumerate() { self.nr_layer_cpus_min_max[lidx] = ( diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index 693f095b8c660..1eb30efeb0ed5 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -142,18 +143,18 @@ int main(int argc, char **argv) while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); - printf(" total:%10lu dispatch:%10lu missing:%10lu\n", + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", skel->bss->nr_total, skel->bss->nr_dispatched, skel->bss->nr_missing); - printf(" kicks:%10lu preemptions:%7lu\n", + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", skel->bss->nr_kicks, skel->bss->nr_preemptions); - printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n", + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", skel->bss->nr_exps, skel->bss->nr_exp_waits, skel->bss->nr_exp_empty); - printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n", + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", skel->bss->nr_cgrp_next, skel->bss->nr_cgrp_coll, skel->bss->nr_cgrp_empty); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 831df3f644d5a..2fb75543a1640 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -95,8 +95,8 @@ struct { } dispatch_idx_cnt SEC(".maps"); /* Statistics */ -unsigned long nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; -unsigned long nr_core_sched_execed; +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -354,8 +354,8 @@ void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args __sync_fetch_and_add(&nr_reenqueued, cnt); } -s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct 
scx_init_task_args *args) { if (p->tgid == disallow_tgid) p->scx.disallow = true; @@ -391,7 +391,7 @@ struct sched_ext_ops qmap_ops = { .dispatch = (void *)qmap_dispatch, .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, - .prep_enable = (void *)qmap_prep_enable, + .init_task = (void *)qmap_init_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .flags = SCX_OPS_ENQ_LAST, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index d817115c0b0a8..7008b91386449 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,7 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu, core=%lu\n", + printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" PRIu64 "\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_core_sched_execed); diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml index 309643687d0c6..a8b4231d1bde9 100644 --- a/tools/sched_ext/scx_rusty/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_rusty" -version = "0.5.0" +version = "0.5.3" authors = ["Dan Schatzberg ", "Meta"] edition = "2021" description = "Userspace scheduling with BPF" @@ -13,15 +13,15 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7.0" hex = "0.4.3" -libbpf-rs = "0.21.0" +libbpf-rs = "0.22.0" libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" -scx_utils = "0.3" +scx_utils = "0.5" simplelog = "0.12.0" [build-dependencies] -scx_utils = "0.3" +scx_utils = "0.5" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_rusty/README.md b/tools/sched_ext/scx_rusty/README.md new file mode 100644 index 0000000000000..990e51aaf43b3 --- /dev/null +++ b/tools/sched_ext/scx_rusty/README.md @@ -0,0 +1,36 @@ +# scx_rusty + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the +scheduler does a simple round robin in each domain, and the user space portion +(written in Rust) calculates the load factor of each domain, and informs BPF of +how tasks should be load balanced accordingly. + +## How To Install + +Available as a [Rust crate](https://crates.io/crates/scx_rusty): `cargo add scx_rusty` + +## Typical Use Case + +Rusty is designed to be flexible, and accommodate different architectures and +workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), +as well as how Rusty should partition the system into scheduling domains, can +be tuned to achieve the optimal configuration for any given system or workload. + +## Production Ready? + +Yes. If tuned correctly, rusty should be performant across various CPU +architectures and workloads. Rusty by default creates a separate scheduling +domain per-LLC, so its default configuration may be performant as well. 
Note +however that scx_rusty does not yet disambiguate between LLCs in different NUMA +nodes, so it may perform better on multi-CCX machines where all the LLCs share +the same socket, as opposed to multi-socket machines. + +Note as well that you may run into an issue with infeasible weights, where a +task with a very high weight may cause the scheduler to incorrectly leave cores +idle because it thinks they're necessary to accommodate the compute for a +single task. This can also happen in CFS, and should soon be addressed for +scx_rusty. diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c index c85e95bf372a4..fe4de979f2a2d 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c @@ -425,10 +425,13 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, if (!(taskc = lookup_task_ctx(p)) || !(p_cpumask = taskc->cpumask)) goto enoent; - if (kthreads_local && - (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + if (p->nr_cpus_allowed == 1) { cpu = prev_cpu; - stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + if (kthreads_local && (p->flags & PF_KTHREAD)) { + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + } else { + stat_add(RUSTY_STAT_PINNED, 1); + } goto direct; } @@ -436,7 +439,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the * local dsq of the waker. */ - if (p->nr_cpus_allowed > 1 && (wake_flags & SCX_WAKE_SYNC)) { + if (wake_flags & SCX_WAKE_SYNC) { struct task_struct *current = (void *)bpf_get_current_task(); if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && @@ -475,13 +478,6 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, } } - /* If only one CPU is allowed, dispatch */ - if (p->nr_cpus_allowed == 1) { - stat_add(RUSTY_STAT_PINNED, 1); - cpu = prev_cpu; - goto direct; - } - has_idle_cores = !bpf_cpumask_empty(idle_smtmask); /* did @p get pulled out to a foreign domain by e.g. greedy execution? 
*/ @@ -956,8 +952,8 @@ void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } -s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct bpf_cpumask *cpumask; struct task_ctx taskc = { .dom_active_pids_gen = -1 }; @@ -1006,7 +1002,8 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p) +void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { pid_t pid = p->pid; long ret; @@ -1159,8 +1156,8 @@ struct sched_ext_ops rusty = { .quiescent = (void *)rusty_quiescent, .set_weight = (void *)rusty_set_weight, .set_cpumask = (void *)rusty_set_cpumask, - .prep_enable = (void *)rusty_prep_enable, - .disable = (void *)rusty_disable, + .init_task = (void *)rusty_init_task, + .exit_task = (void *)rusty_exit_task, .init = (void *)rusty_init, .exit = (void *)rusty_exit, .name = "rusty", diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index ff7cc9d80a7ea..3192ee049f9f2 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -187,6 +187,15 @@ fn read_total_cpu(reader: &procfs::ProcReader) -> Result { .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) } +fn sub_or_zero(curr: &u64, prev: &u64) -> u64 +{ + if let Some(res) = curr.checked_sub(*prev) { + res + } else { + 0 + } +} + fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { match (curr, prev) { ( @@ -213,14 +222,14 @@ fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { .. }, ) => { - let idle_usec = curr_idle - prev_idle; - let iowait_usec = curr_iowait - prev_iowait; - let user_usec = curr_user - prev_user; - let system_usec = curr_system - prev_system; - let nice_usec = curr_nice - prev_nice; - let irq_usec = curr_irq - prev_irq; - let softirq_usec = curr_softirq - prev_softirq; - let stolen_usec = curr_stolen - prev_stolen; + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); let busy_usec = user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; @@ -426,7 +435,7 @@ impl Tuner { .read_stat()? .cpus_map .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; - let ti = &mut skel.bss().tune_input; + let ti = &mut skel.bss_mut().tune_input; let mut dom_nr_cpus = vec![0; self.top.nr_doms]; let mut dom_util_sum = vec![0.0; self.top.nr_doms]; @@ -620,7 +629,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // XXX - We can't read task_ctx inline because self.skel.bss() // borrows mutably and thus conflicts with self.skel.maps(). 
const MAX_PIDS: u64 = bpf_intf::consts_MAX_DOM_ACTIVE_PIDS as u64; - let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; + let active_pids = &mut self.skel.bss_mut().dom_active_pids[dom as usize]; let mut pids = vec![]; let (mut ridx, widx) = (active_pids.read_idx, active_pids.write_idx); @@ -901,16 +910,16 @@ impl<'a> Scheduler<'a> { Topology::from_cache_level(opts.cache_level, nr_cpus)? }); - skel.rodata().nr_doms = top.nr_doms as u32; - skel.rodata().nr_cpus = top.nr_cpus as u32; + skel.rodata_mut().nr_doms = top.nr_doms as u32; + skel.rodata_mut().nr_cpus = top.nr_cpus as u32; for (cpu, dom) in top.cpu_dom.iter().enumerate() { - skel.rodata().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; + skel.rodata_mut().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; } for (dom, cpus) in top.dom_cpus.iter().enumerate() { let raw_cpus_slice = cpus.as_raw_slice(); - let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; + let dom_cpumask_slice = &mut skel.rodata_mut().dom_cpumasks[dom]; let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpus_slice.len()); left.clone_from_slice(cpus.as_raw_slice()); info!( @@ -921,13 +930,13 @@ impl<'a> Scheduler<'a> { ); } - skel.rodata().slice_ns = opts.slice_us * 1000; - skel.rodata().load_half_life = (opts.load_half_life * 1000000000.0) as u32; - skel.rodata().kthreads_local = opts.kthreads_local; - skel.rodata().fifo_sched = opts.fifo_sched; - skel.rodata().switch_partial = opts.partial; - skel.rodata().greedy_threshold = opts.greedy_threshold; - skel.rodata().debug = opts.verbose as u32; + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().load_half_life = (opts.load_half_life * 1000000000.0) as u32; + skel.rodata_mut().kthreads_local = opts.kthreads_local; + skel.rodata_mut().fifo_sched = opts.fifo_sched; + skel.rodata_mut().switch_partial = opts.partial; + skel.rodata_mut().greedy_threshold = opts.greedy_threshold; + skel.rodata_mut().debug = opts.verbose as u32; // Attach. 
let mut skel = skel.load().context("Failed to load BPF program")?; @@ -994,14 +1003,14 @@ impl<'a> Scheduler<'a> { guest_nice_usec: _, }, ) => { - let idle_usec = curr_idle - prev_idle; - let iowait_usec = curr_iowait - prev_iowait; - let user_usec = curr_user - prev_user; - let system_usec = curr_system - prev_system; - let nice_usec = curr_nice - prev_nice; - let irq_usec = curr_irq - prev_irq; - let softirq_usec = curr_softirq - prev_softirq; - let stolen_usec = curr_stolen - prev_stolen; + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); let busy_usec = user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index 7485acbc4f509..95035aa29b10e 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -51,19 +51,22 @@ static inline bool vtime_before(u64 a, u64 b) return (s64)(a - b) < 0; } -void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { - /* - * If scx_select_cpu_dfl() is setting %SCX_ENQ_LOCAL, it indicates that - * running @p on its CPU directly shouldn't affect fairness. Just queue - * it on the local FIFO. - */ - if (enq_flags & SCX_ENQ_LOCAL) { + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { stat_inc(0); /* count local queueing */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); - return; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ stat_inc(1); /* count global queueing */ if (fifo_sched) { @@ -120,8 +123,7 @@ void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; } -void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, - struct scx_enable_args *args) +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) { p->scx.dsq_vtime = vtime_now; } @@ -141,6 +143,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, .enqueue = (void *)simple_enqueue, .dispatch = (void *)simple_dispatch, .running = (void *)simple_running, diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c index f2791a6aecc8b..4cdc3a6fb880a 100644 --- a/tools/sched_ext/scx_userland.bpf.c +++ b/tools/sched_ext/scx_userland.bpf.c @@ -20,10 +20,14 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#include #include #include "scx_userland.h" +/* + * Maximum amount of tasks enqueued/dispatched between kernel and user-space. + */ +#define MAX_ENQUEUED_TASKS 4096 + char _license[] SEC("license") = "GPL"; const volatile bool switch_partial; @@ -35,13 +39,24 @@ const volatile u32 num_possible_cpus = 64; /* Stats that are printed by user space. 
*/ u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; -struct user_exit_info uei; +/* + * Number of tasks that are queued for scheduling. + * + * This number is incremented by the BPF component when a task is queued to the + * user-space scheduler and it must be decremented by the user-space scheduler + * when a task is consumed. + */ +volatile u64 nr_queued; /* - * Whether the user space scheduler needs to be scheduled due to a task being - * enqueued in user space. + * Number of tasks that are waiting for scheduling. + * + * This number must be updated by the user-space scheduler to keep track if + * there is still some scheduling work to do. */ -static bool usersched_needed; +volatile u64 nr_scheduled; + +struct user_exit_info uei; /* * The map containing tasks that are enqueued in user space from the kernel. @@ -50,7 +65,7 @@ static bool usersched_needed; */ struct { __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, USERLAND_MAX_TASKS); + __uint(max_entries, MAX_ENQUEUED_TASKS); __type(value, struct scx_userland_enqueued_task); } enqueued SEC(".maps"); @@ -61,7 +76,7 @@ struct { */ struct { __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, USERLAND_MAX_TASKS); + __uint(max_entries, MAX_ENQUEUED_TASKS); __type(value, s32); } dispatched SEC(".maps"); @@ -78,6 +93,29 @@ struct { __type(value, struct task_ctx); } task_ctx_stor SEC(".maps"); +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +/* + * Set user-space scheduler wake-up flag (equivalent to an atomic release + * operation). + */ +static void set_usersched_needed(void) +{ + __sync_fetch_and_or(&usersched_needed, 1); +} + +/* + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic + * acquire operation). + */ +static bool test_and_clear_usersched_needed(void) +{ + return __sync_fetch_and_and(&usersched_needed, 0) == 1; +} + static bool is_usersched_task(const struct task_struct *p) { return p->pid == usersched_pid; @@ -136,7 +174,6 @@ static void dispatch_user_scheduler(void) { struct task_struct *p; - usersched_needed = false; p = usersched_task(); if (p) { scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); @@ -146,9 +183,8 @@ static void dispatch_user_scheduler(void) static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) { - struct scx_userland_enqueued_task task; + struct scx_userland_enqueued_task task = {}; - memset(&task, 0, sizeof(task)); task.pid = p->pid; task.sum_exec_runtime = p->se.sum_exec_runtime; task.weight = p->scx.weight; @@ -162,7 +198,7 @@ static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } else { __sync_fetch_and_add(&nr_user_enqueues, 1); - usersched_needed = true; + set_usersched_needed(); } } @@ -191,10 +227,10 @@ void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) { - if (usersched_needed) + if (test_and_clear_usersched_needed()) dispatch_user_scheduler(); - bpf_repeat(4096) { + bpf_repeat(MAX_ENQUEUED_TASKS) { s32 pid; struct task_struct *p; @@ -215,8 +251,57 @@ void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) } } -s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +/* + * A CPU is about to change its idle state. 
If the CPU is going idle, ensure + * that the user-space scheduler has a chance to run if there is any remaining + * work to do. + */ +void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) +{ + /* + * Don't do anything if we exit from and idle state, a CPU owner will + * be assigned in .running(). + */ + if (!idle) + return; + /* + * A CPU is now available, notify the user-space scheduler that tasks + * can be dispatched, if there is at least one task waiting to be + * scheduled, either queued (accounted in nr_queued) or scheduled + * (accounted in nr_scheduled). + * + * NOTE: nr_queued is incremented by the BPF component, more exactly in + * enqueue(), when a task is sent to the user-space scheduler, then + * the scheduler drains the queued tasks (updating nr_queued) and adds + * them to its internal data structures / state; at this point tasks + * become "scheduled" and the user-space scheduler will take care of + * updating nr_scheduled accordingly; lastly tasks will be dispatched + * and the user-space scheduler will update nr_scheduled again. + * + * Checking both counters allows to determine if there is still some + * pending work to do for the scheduler: new tasks have been queued + * since last check, or there are still tasks "queued" or "scheduled" + * since the previous user-space scheduler run. If the counters are + * both zero it is pointless to wake-up the scheduler (even if a CPU + * becomes idle), because there is nothing to do. + * + * Keep in mind that update_idle() doesn't run concurrently with the + * user-space scheduler (that is single-threaded): this function is + * naturally serialized with the user-space scheduler code, therefore + * this check here is also safe from a concurrency perspective. + */ + if (nr_queued || nr_scheduled) { + /* + * Kick the CPU to make it immediately ready to accept + * dispatched tasks. + */ + set_usersched_needed(); + scx_bpf_kick_cpu(cpu, 0); + } +} + +s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, + struct scx_init_task_args *args) { if (bpf_task_storage_get(&task_ctx_stor, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -254,9 +339,11 @@ struct sched_ext_ops userland_ops = { .select_cpu = (void *)userland_select_cpu, .enqueue = (void *)userland_enqueue, .dispatch = (void *)userland_dispatch, - .prep_enable = (void *)userland_prep_enable, + .update_idle = (void *)userland_update_idle, + .init_task = (void *)userland_init_task, .init = (void *)userland_init, .exit = (void *)userland_exit, + .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, .timeout_ms = 3000, .name = "userland", }; diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index fef028a1756e0..368acd0b38bd9 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -36,6 +36,8 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" +"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" +"\n" "Usage: %s [-b BATCH] [-p]\n" "\n" " -b BATCH The number of tasks to batch when dispatching (default: 8)\n" @@ -55,7 +57,10 @@ static struct scx_userland *skel; static struct bpf_link *ops_link; /* Stats collected in user space. */ -static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; + +/* Number of tasks currently enqueued. */ +static __u64 nr_curr_enqueued; /* The data structure containing tasks that are enqueued in user space. 
*/ struct enqueued_task { @@ -80,13 +85,15 @@ LIST_HEAD(listhead, enqueued_task); static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); /* - * The statically allocated array of tasks. We use a statically allocated list - * here to avoid having to allocate on the enqueue path, which could cause a + * The main array of tasks. The array is allocated all at once during + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to + * dynamically allocate memory on the enqueue path, which could cause a * deadlock. A more substantive user space scheduler could e.g. provide a hook * for newly enabled tasks that are passed to the scheduler from the * .prep_enable() callback to allows the scheduler to allocate on safe paths. */ -struct enqueued_task tasks[USERLAND_MAX_TASKS]; +struct enqueued_task *tasks; +static int pid_max; static double min_vruntime; @@ -95,6 +102,41 @@ static void sigint_handler(int userland) exit_req = 1; } +static int get_pid_max(void) +{ + FILE *fp; + int pid_max; + + fp = fopen("/proc/sys/kernel/pid_max", "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); + return -1; + } + if (fscanf(fp, "%d", &pid_max) != 1) { + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); + fclose(fp); + return -1; + } + fclose(fp); + + return pid_max; +} + +static int init_tasks(void) +{ + pid_max = get_pid_max(); + if (pid_max < 0) + return pid_max; + + tasks = calloc(pid_max, sizeof(*tasks)); + if (!tasks) { + fprintf(stderr, "Error allocating tasks array\n"); + return -ENOMEM; + } + + return 0; +} + static __u32 task_pid(const struct enqueued_task *task) { return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); @@ -106,8 +148,7 @@ static int dispatch_task(__s32 pid) err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); if (err) { - fprintf(stderr, "Failed to dispatch task %d\n", pid); - exit_req = 1; + nr_vruntime_failed++; } else { nr_vruntime_dispatches++; } @@ -117,7 +158,7 @@ static int dispatch_task(__s32 pid) static struct enqueued_task *get_enqueued_task(__s32 pid) { - if (pid >= USERLAND_MAX_TASKS) + if (pid >= pid_max) return NULL; return &tasks[pid]; @@ -153,6 +194,7 @@ static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) update_enqueued(curr, bpf_task); nr_vruntime_enqueues++; + nr_curr_enqueued++; /* * Enqueue the task in a vruntime-sorted list. A more optimal data @@ -186,8 +228,11 @@ static void drain_enqueued_map(void) struct scx_userland_enqueued_task task; int err; - if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { + skel->bss->nr_queued = 0; + skel->bss->nr_scheduled = nr_curr_enqueued; return; + } err = vruntime_enqueue(&task); if (err) { @@ -210,18 +255,24 @@ static void dispatch_batch(void) task = LIST_FIRST(&vruntime_head); if (!task) - return; + break; min_vruntime = task->vruntime; pid = task_pid(task); LIST_REMOVE(task, entries); err = dispatch_task(pid); if (err) { - fprintf(stderr, "Failed to dispatch task %d in %u\n", - pid, i); - return; + /* + * If we fail to dispatch, put the task back to the + * vruntime_head list and stop dispatching additional + * tasks in this batch. 
+ */ + LIST_INSERT_HEAD(&vruntime_head, task, entries); + break; } + nr_curr_enqueued--; } + skel->bss->nr_scheduled = nr_curr_enqueued; } static void *run_stats_printer(void *arg) @@ -248,8 +299,10 @@ static void *run_stats_printer(void *arg) printf("|-----------------------|\n"); printf("| enq: %10llu |\n", nr_vruntime_enqueues); printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("| failed: %10llu |\n", nr_vruntime_failed); printf("o-----------------------o\n"); printf("\n\n"); + fflush(stdout); sleep(1); } @@ -272,6 +325,10 @@ static void bootstrap(int argc, char **argv) }; bool switch_partial = false; + err = init_tasks(); + if (err) + exit(err); + signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); libbpf_set_strict_mode(LIBBPF_STRICT_ALL); diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h index 639c6809c5ffe..684fb2dd5de96 100644 --- a/tools/sched_ext/scx_userland.h +++ b/tools/sched_ext/scx_userland.h @@ -4,8 +4,6 @@ #ifndef __SCX_USERLAND_COMMON_H #define __SCX_USERLAND_COMMON_H -#define USERLAND_MAX_TASKS 8192 - /* * An instance of a task that has been enqueued by the kernel for consumption * by a user space global scheduler thread. From 9ad553514c21c09c0a9147672ca66306fd41fced Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 9 Jan 2024 14:24:37 -0600 Subject: [PATCH 251/304] scx: Fix direct dispatch for non-builtin DSQs If we've done a direct dispatch from ops.select_cpu(), we can currently hang the host if we dispatch to a non-local DSQ. This is because we circumvent some important checks, such as whether we should be bypassing ops.enqueue() and dispatching directly to the local or global DSQ. Doing a local dispatch now doesn't hang the host because we happen to be dispatching to a safe, builtin DSQ. Let's instead update the logic to only do the direct dispatch after these critical checks. Signed-off-by: David Vernet --- kernel/sched/ext.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2a552efeec6f3..c10a2597a0f21 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -681,6 +681,13 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, dsq->nr++; p->scx.dsq = dsq; + /* + * scx.ddsq_id is only relevant on the direct dispatch path, but we + * clear it here because the direct dispatch verdict may be overridden + * on the enqueue path during e.g. bypass. + */ + p->scx.ddsq_id = SCX_DSQ_INVALID; + /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to * match waiters' load_acquire. 
@@ -854,7 +861,6 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsq_id, p); dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); - p->scx.ddsq_id = SCX_DSQ_INVALID; } static bool test_rq_online(struct rq *rq) @@ -874,9 +880,6 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); - if (p->scx.ddsq_id != SCX_DSQ_INVALID) - goto direct; - /* rq migration */ if (sticky_cpu == cpu_of(rq)) goto local_norefill; @@ -896,6 +899,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, goto global; } + if (p->scx.ddsq_id != SCX_DSQ_INVALID) + goto direct; + /* see %SCX_OPS_ENQ_EXITING */ if (!static_branch_unlikely(&scx_ops_enq_exiting) && unlikely(p->flags & PF_EXITING)) From 4b56f6e5a156d8be0703bd060317103308300b48 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 8 Jan 2024 18:01:25 -0600 Subject: [PATCH 252/304] scx: Keep track of enq flags in direct dispatch We're currently not remembering enq flags during direct dispatch. Let's record them in case someone wants to pass e.g. SCX_ENQ_PREEMPT from ops.select_cpu(). Let's also reset ddsq_id and ddsq_enq_flags before calling dispatch_enqueue() to ensure there's no races with the task being consumed from another core. Signed-off-by: David Vernet --- include/linux/sched/ext.h | 8 +++--- init/init_task.c | 3 ++- kernel/sched/core.c | 3 ++- kernel/sched/ext.c | 54 +++++++++++++++++++++------------------ 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 5f457194e3a8c..e629686cf0621 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -612,9 +612,8 @@ struct scx_dispatch_q { enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ - SCX_TASK_DDSP_PRIQ = 1 << 2, /* task should be enqueued on priq when directly dispatched */ - SCX_TASK_RESET_RUNNABLE_AT = 1 << 3, /* runnable_at should be reset */ - SCX_TASK_DEQD_FOR_SLEEP = 1 << 4, /* last dequeue was for SLEEP */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ SCX_TASK_STATE_BITS = 2, @@ -689,7 +688,8 @@ struct sched_ext_entity { #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ #endif - u64 ddsq_id; + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; /* BPF scheduler modifiable fields */ diff --git a/init/init_task.c b/init/init_task.c index 1e035992a52b9..54c9244ef9e5d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -114,7 +114,8 @@ struct task_struct init_task .ops_state = ATOMIC_INIT(0), .runnable_at = INITIAL_JIFFIES, .slice = SCX_SLICE_DFL, - .ddsq_id = SCX_DSQ_INVALID, + .ddsp_dsq_id = SCX_DSQ_INVALID, + .ddsp_enq_flags = 0, }, #endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 781e8a00b6d59..937ef9353c0b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4564,7 +4564,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) atomic_long_set(&p->scx.ops_state, 0); p->scx.runnable_at = INITIAL_JIFFIES; p->scx.slice = SCX_SLICE_DFL; - p->scx.ddsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; #endif #ifdef 
CONFIG_PREEMPT_NOTIFIERS diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c10a2597a0f21..dd09b53254f59 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -682,11 +682,13 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, p->scx.dsq = dsq; /* - * scx.ddsq_id is only relevant on the direct dispatch path, but we - * clear it here because the direct dispatch verdict may be overridden - * on the enqueue path during e.g. bypass. + * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the + * direct dispatch path, but we clear them here because the direct + * dispatch verdict may be overridden on the enqueue path during e.g. + * bypass. */ - p->scx.ddsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to @@ -840,12 +842,11 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, return; } - WARN_ON_ONCE(p->scx.ddsq_id != SCX_DSQ_INVALID); - WARN_ON_ONCE(p->scx.flags & SCX_TASK_DDSP_PRIQ); + WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); + WARN_ON_ONCE(p->scx.ddsp_enq_flags); - p->scx.ddsq_id = dsq_id; - if (enq_flags & SCX_ENQ_DSQ_PRIQ) - p->scx.flags |= SCX_TASK_DDSP_PRIQ; + p->scx.ddsp_dsq_id = dsq_id; + p->scx.ddsp_enq_flags = enq_flags; } static void direct_dispatch(struct task_struct *p, u64 enq_flags) @@ -854,13 +855,9 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) touch_core_sched_dispatch(task_rq(p), p); - if (p->scx.flags & SCX_TASK_DDSP_PRIQ) { - enq_flags |= SCX_ENQ_DSQ_PRIQ; - p->scx.flags &= ~SCX_TASK_DDSP_PRIQ; - } - - dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsq_id, p); - dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags); } static bool test_rq_online(struct rq *rq) @@ -899,7 +896,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, goto global; } - if (p->scx.ddsq_id != SCX_DSQ_INVALID) + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* see %SCX_OPS_ENQ_EXITING */ @@ -928,7 +925,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); *ddsp_taskp = NULL; - if (p->scx.ddsq_id != SCX_DSQ_INVALID) + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* @@ -2148,7 +2145,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); if (found) { p->scx.slice = SCX_SLICE_DFL; - p->scx.ddsq_id = SCX_DSQ_LOCAL; + p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; } return cpu; } @@ -4107,13 +4104,20 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags * @enq_flags: SCX_ENQ_* * * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe - * to call this function spuriously. Can be called from ops.enqueue() and - * ops.dispatch(). + * to call this function spuriously. Can be called from ops.enqueue(), + * ops.select_cpu(), and ops.dispatch(). + * + * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch + * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be + * used to target the local DSQ of a CPU other than the enqueueing one. Use + * ops.select_cpu() to be on the target CPU in the first place. 
* - * When called from ops.enqueue(), it's for direct dispatch and @p must match - * the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be used to target the - * local DSQ of a CPU other than the enqueueing one. Use ops.select_cpu() to be - * on the target CPU in the first place. + * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p + * will be directly dispatched to the corresponding dispatch queue after + * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be + * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). + * @enq_flags are OR'd with the enqueue flags on the enqueue path before the + * task is dispatched. * * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id * and this function can be called upto ops.dispatch_max_batch times to dispatch From 59ad5bda56bfedc2a026980868ddd790efe1ae63 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 9 Jan 2024 09:45:59 -0600 Subject: [PATCH 253/304] scx: Test vtime dispatching from ops.select_cpu() Let's test that we properly stash enq flags by doing vtime dispatching from ops.select_cpu(). Signed-off-by: David Vernet --- tools/testing/selftests/scx/.gitignore | 1 + tools/testing/selftests/scx/Makefile | 3 +- .../selftests/scx/select_cpu_vtime.bpf.c | 94 +++++++++++++++++++ .../testing/selftests/scx/select_cpu_vtime.c | 39 ++++++++ 4 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/scx/select_cpu_vtime.bpf.c create mode 100644 tools/testing/selftests/scx/select_cpu_vtime.c diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore index 991721c50d9ee..4ae433bb3955d 100644 --- a/tools/testing/selftests/scx/.gitignore +++ b/tools/testing/selftests/scx/.gitignore @@ -9,4 +9,5 @@ select_cpu_dfl_nodispatch select_cpu_dispatch select_cpu_dispatch_dbl_dsp select_cpu_dispatch_bad_dsq +select_cpu_vtime build/ diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index ae713d614f252..8a0b66236ada4 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -158,7 +158,8 @@ c-sched-targets := \ select_cpu_dfl_nodispatch \ select_cpu_dispatch \ select_cpu_dispatch_bad_dsq \ - select_cpu_dispatch_dbl_dsp + select_cpu_dispatch_dbl_dsp \ + select_cpu_vtime $(c-sched-targets): %: $(filter-out %.bpf.c,%.c) $(INCLUDE_DIR)/%.bpf.skel.h $(eval sched=$(notdir $@)) diff --git a/tools/testing/selftests/scx/select_cpu_vtime.bpf.c b/tools/testing/selftests/scx/select_cpu_vtime.bpf.c new file mode 100644 index 0000000000000..b8bdadf3e541b --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_vtime.bpf.c @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from + * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and + * making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +volatile bool consumed; + +static u64 vtime_now; + +#define VTIME_DSQ 0 + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline u64 task_vtime(const struct task_struct *p) +{ + u64 vtime = p->scx.dsq_vtime; + + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + return vtime_now - SCX_SLICE_DFL; + else + return vtime; +} + +s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto ddsp; + + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); +ddsp: + scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) +{ + if (scx_bpf_consume(VTIME_DSQ)) + consumed = true; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) +{ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, + bool runnable) +{ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) +{ + scx_bpf_switch_all(); + + return scx_bpf_create_dsq(VTIME_DSQ, -1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_vtime_ops = { + .select_cpu = select_cpu_vtime_select_cpu, + .dispatch = select_cpu_vtime_dispatch, + .running = select_cpu_vtime_running, + .stopping = select_cpu_vtime_stopping, + .enable = select_cpu_vtime_enable, + .init = select_cpu_vtime_init, + .name = "select_cpu_vtime", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_vtime.c b/tools/testing/selftests/scx/select_cpu_vtime.c new file mode 100644 index 0000000000000..6f72f0625478c --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_vtime.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include "select_cpu_vtime.bpf.skel.h" +#include "scx_test.h" + +int main(int argc, char **argv) +{ + struct select_cpu_vtime *skel; + struct bpf_link *link; + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = select_cpu_vtime__open_and_load(); + SCX_BUG_ON(!skel, "Failed to open and load skel"); + + SCX_ASSERT(!skel->bss->consumed); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_ASSERT(skel->bss->consumed); + + bpf_link__destroy(link); + select_cpu_vtime__destroy(skel); + + return 0; +} From c64a804d113a1442e9a94cc9ed3244386751b7cb Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Jan 2024 20:36:26 -0600 Subject: [PATCH 254/304] scx: Implement scx selftests framework We want to make it as easy as possible both to run tests, and to implement them. This means we ideally want a single test runner binary that can run the testcases, while also making it trivial to add a testcase without worrying about having to update the runner itself. 
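For illustration, a testcase under this framework reduces to a few callbacks plus a registration macro. The sketch below is condensed from the test_example.c added later in this patch; the test name and the (empty) callback bodies are purely illustrative:

#include "scx_test.h"

static enum scx_test_status setup(void **ctx)
{
	/* Optional: open/load a BPF skeleton and stash it in *ctx. */
	return SCX_TEST_PASS;
}

static enum scx_test_status run(void *ctx)
{
	/* Required: attach the scheduler and assert with SCX_ASSERT(), SCX_EQ(), ... */
	return SCX_TEST_PASS;
}

static void cleanup(void *ctx)
{
	/* Optional: tear down whatever setup() created. */
}

struct scx_test sketch_test = {
	.name = "sketch_test",
	.description = "Illustrative placeholder, not part of this series",
	.setup = setup,
	.run = run,
	.cleanup = cleanup,
};
REGISTER_SCX_TEST(&sketch_test)
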
To accomplish this, this patch adds a new declarative mechanism for defining scx tests by implementing a struct scx_test object. Tests can simply define such a struct, and then register it with the testrunner using a REGISTER_SCX_TEST macro. The build system will automatically compile the testcase and add machinery to have it be auto-registered into the runner binary. The runner binary then outputs test results in ktap [0] format so it can be consumed by CI systems. [0]: https://docs.kernel.org/dev-tools/ktap.html This patch simply implements the framework, adds a test_example.c file that illustrates how to add a testcase, and converts a few existing testcases to use the framework. If the framework is acceptable, we can convert the rest. Signed-off-by: David Vernet --- tools/testing/selftests/scx/.gitignore | 7 +- tools/testing/selftests/scx/Makefile | 40 +++- ...q_fail.bpf.c => ddsp_bogus_dsq_fail.bpf.c} | 12 +- .../selftests/scx/ddsp_bogus_dsq_fail.c | 55 ++++++ ..._fail.bpf.c => ddsp_vtimelocal_fail.bpf.c} | 12 +- .../selftests/scx/ddsp_vtimelocal_fail.c | 54 +++++ .../selftests/scx/dsp_fallbackdsq_fail.c | 36 ---- .../testing/selftests/scx/dsp_localdsq_fail.c | 36 ---- .../selftests/scx/enq_last_no_enq_fails.c | 50 +++-- ...fails.bpf.c => enq_select_cpu_fails.bpf.c} | 19 +- .../selftests/scx/enq_select_cpu_fails.c | 61 ++++++ .../selftests/scx/enqueue_select_cpu_fails.c | 36 ---- .../testing/selftests/scx/init_enable_count.c | 46 +++-- tools/testing/selftests/scx/minimal.c | 64 +++--- tools/testing/selftests/scx/runner.c | 187 ++++++++++++++++++ tools/testing/selftests/scx/scx_test.h | 108 ++++++++-- tools/testing/selftests/scx/test_example.c | 49 +++++ 17 files changed, 666 insertions(+), 206 deletions(-) rename tools/testing/selftests/scx/{dsp_fallbackdsq_fail.bpf.c => ddsp_bogus_dsq_fail.bpf.c} (70%) create mode 100644 tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c rename tools/testing/selftests/scx/{dsp_localdsq_fail.bpf.c => ddsp_vtimelocal_fail.bpf.c} (69%) create mode 100644 tools/testing/selftests/scx/ddsp_vtimelocal_fail.c delete mode 100644 tools/testing/selftests/scx/dsp_fallbackdsq_fail.c delete mode 100644 tools/testing/selftests/scx/dsp_localdsq_fail.c rename tools/testing/selftests/scx/{enqueue_select_cpu_fails.bpf.c => enq_select_cpu_fails.bpf.c} (63%) create mode 100644 tools/testing/selftests/scx/enq_select_cpu_fails.c delete mode 100644 tools/testing/selftests/scx/enqueue_select_cpu_fails.c create mode 100644 tools/testing/selftests/scx/runner.c create mode 100644 tools/testing/selftests/scx/test_example.c diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore index 4ae433bb3955d..2c077082b67a5 100644 --- a/tools/testing/selftests/scx/.gitignore +++ b/tools/testing/selftests/scx/.gitignore @@ -1,9 +1,10 @@ -dsp_fallbackdsq_fail -dsp_localdsq_fail +ddsp_bogus_dsq_fail +ddsp_vtimelocal_fail enq_last_no_enq_fails -enqueue_select_cpu_fails +enq_select_cpu_fails init_enable_count minimal +runner select_cpu_dfl select_cpu_dfl_nodispatch select_cpu_dispatch diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index 8a0b66236ada4..c28f23c0442cc 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -68,7 +68,7 @@ ifneq ($(LLVM),) CFLAGS += -Wno-unused-command-line-argument endif -LDFLAGS = -lelf -lz -lpthread +LDFLAGS = -lelf -lz -lpthread -lzstd IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - cpus_ptr, 0); @@ -26,7 +26,7 @@ s32 
BPF_STRUCT_OPS(dsp_fallbackdsq_fail_select_cpu, struct task_struct *p, return prev_cpu; } -s32 BPF_STRUCT_OPS(dsp_fallbackdsq_fail_init) +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_init) { scx_bpf_switch_all(); @@ -34,9 +34,9 @@ s32 BPF_STRUCT_OPS(dsp_fallbackdsq_fail_init) } SEC(".struct_ops.link") -struct sched_ext_ops dsp_fallbackdsq_fail_ops = { - .select_cpu = dsp_fallbackdsq_fail_select_cpu, - .init = dsp_fallbackdsq_fail_init, - .name = "dsp_fallbackdsq_fail", +struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { + .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .init = ddsp_bogus_dsq_fail_init, + .name = "ddsp_bogus_dsq_fail", .timeout_ms = 1000U, }; diff --git a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c new file mode 100644 index 0000000000000..b12e804a0b664 --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "ddsp_bogus_dsq_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_bogus_dsq_fail *skel; + + skel = ddsp_bogus_dsq_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + + ddsp_bogus_dsq_fail__destroy(skel); +} + +struct scx_test ddsp_bogus_dsq_fail = { + .name = "ddsp_bogus_dsq_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct dispatch to an invalid" + " DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) diff --git a/tools/testing/selftests/scx/dsp_localdsq_fail.bpf.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c similarity index 69% rename from tools/testing/selftests/scx/dsp_localdsq_fail.bpf.c rename to tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c index e27a95a8726be..14a40be192c5d 100644 --- a/tools/testing/selftests/scx/dsp_localdsq_fail.bpf.c +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c @@ -8,7 +8,7 @@ char _license[] SEC("license") = "GPL"; -s32 BPF_STRUCT_OPS(dsp_localdsq_fail_select_cpu, struct task_struct *p, +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); @@ -23,7 +23,7 @@ s32 BPF_STRUCT_OPS(dsp_localdsq_fail_select_cpu, struct task_struct *p, return prev_cpu; } -s32 BPF_STRUCT_OPS(dsp_localdsq_fail_init) +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_init) { scx_bpf_switch_all(); @@ -31,9 +31,9 @@ s32 BPF_STRUCT_OPS(dsp_localdsq_fail_init) } SEC(".struct_ops.link") -struct sched_ext_ops dsp_localdsq_fail_ops = { - .select_cpu = dsp_localdsq_fail_select_cpu, - .init = dsp_localdsq_fail_init, - .name = "dsp_localdsq_fail", +struct sched_ext_ops ddsp_vtimelocal_fail_ops = { + .select_cpu = ddsp_vtimelocal_fail_select_cpu, + .init = ddsp_vtimelocal_fail_init, + .name = 
"ddsp_vtimelocal_fail", .timeout_ms = 1000U, }; diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c new file mode 100644 index 0000000000000..03bbadfaaf2f1 --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include "ddsp_vtimelocal_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_vtimelocal_fail *skel; + + skel = ddsp_vtimelocal_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + + ddsp_vtimelocal_fail__destroy(skel); +} + +struct scx_test ddsp_vtimelocal_fail = { + .name = "ddsp_vtimelocal_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct vtime dispatch to a " + "built-in DSQ from DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) diff --git a/tools/testing/selftests/scx/dsp_fallbackdsq_fail.c b/tools/testing/selftests/scx/dsp_fallbackdsq_fail.c deleted file mode 100644 index fd70cd89d9d06..0000000000000 --- a/tools/testing/selftests/scx/dsp_fallbackdsq_fail.c +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. - * Copyright (c) 2024 David Vernet - * Copyright (c) 2024 Tejun Heo - */ -#include -#include -#include -#include -#include -#include -#include -#include "dsp_fallbackdsq_fail.bpf.skel.h" -#include "scx_test.h" - -int main(int argc, char **argv) -{ - struct dsp_fallbackdsq_fail *skel; - struct bpf_link *link; - - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - skel = dsp_fallbackdsq_fail__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); - - link = bpf_map__attach_struct_ops(skel->maps.dsp_fallbackdsq_fail_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); - - sleep(1); - - bpf_link__destroy(link); - dsp_fallbackdsq_fail__destroy(skel); - - return 0; -} diff --git a/tools/testing/selftests/scx/dsp_localdsq_fail.c b/tools/testing/selftests/scx/dsp_localdsq_fail.c deleted file mode 100644 index 4840386ba7643..0000000000000 --- a/tools/testing/selftests/scx/dsp_localdsq_fail.c +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
- * Copyright (c) 2024 David Vernet - * Copyright (c) 2024 Tejun Heo - */ -#include -#include -#include -#include -#include -#include -#include -#include "dsp_localdsq_fail.bpf.skel.h" -#include "scx_test.h" - -int main(int argc, char **argv) -{ - struct dsp_localdsq_fail *skel; - struct bpf_link *link; - - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - skel = dsp_localdsq_fail__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); - - link = bpf_map__attach_struct_ops(skel->maps.dsp_localdsq_fail_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); - - sleep(1); - - bpf_link__destroy(link); - dsp_localdsq_fail__destroy(skel); - - return 0; -} diff --git a/tools/testing/selftests/scx/enq_last_no_enq_fails.c b/tools/testing/selftests/scx/enq_last_no_enq_fails.c index 1f3d4d8adcc7f..2a3eda5e2c0b4 100644 --- a/tools/testing/selftests/scx/enq_last_no_enq_fails.c +++ b/tools/testing/selftests/scx/enq_last_no_enq_fails.c @@ -4,31 +4,57 @@ * Copyright (c) 2023 David Vernet * Copyright (c) 2023 Tejun Heo */ -#include -#include -#include -#include #include #include #include +#include #include "enq_last_no_enq_fails.bpf.skel.h" #include "scx_test.h" -int main(int argc, char **argv) +static enum scx_test_status setup(void **ctx) { struct enq_last_no_enq_fails *skel; - struct bpf_link *link; - - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = enq_last_no_enq_fails__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + struct bpf_link *link; link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); - SCX_BUG_ON(link, "Succeeded in attaching struct_ops"); + if (link) { + SCX_ERR("Incorrectly succeeded in to attaching scheduler"); + return SCX_TEST_FAIL; + } bpf_link__destroy(link); - enq_last_no_enq_fails__destroy(skel); - return 0; + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + + enq_last_no_enq_fails__destroy(skel); } + +struct scx_test enq_last_no_enq_fails = { + .name = "enq_last_no_enq_fails", + .description = "Verify we fail to load a scheduler if we specify " + "the SCX_OPS_ENQ_LAST flag without defining " + "ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_last_no_enq_fails) diff --git a/tools/testing/selftests/scx/enqueue_select_cpu_fails.bpf.c b/tools/testing/selftests/scx/enq_select_cpu_fails.bpf.c similarity index 63% rename from tools/testing/selftests/scx/enqueue_select_cpu_fails.bpf.c rename to tools/testing/selftests/scx/enq_select_cpu_fails.bpf.c index 61f04fa4ce2b3..40ea393b2bbc9 100644 --- a/tools/testing/selftests/scx/enqueue_select_cpu_fails.bpf.c +++ b/tools/testing/selftests/scx/enq_select_cpu_fails.bpf.c @@ -1,8 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * A scheduler that validates the behavior of direct dispatching with a default - * select_cpu implementation. - * * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
* Copyright (c) 2023 David Vernet * Copyright (c) 2023 Tejun Heo @@ -16,13 +13,13 @@ char _license[] SEC("license") = "GPL"; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found) __ksym; -s32 BPF_STRUCT_OPS(enqueue_select_cpu_fails_select_cpu, struct task_struct *p, +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return prev_cpu; } -void BPF_STRUCT_OPS(enqueue_select_cpu_fails_enqueue, struct task_struct *p, +void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, u64 enq_flags) { /* @@ -37,7 +34,7 @@ void BPF_STRUCT_OPS(enqueue_select_cpu_fails_enqueue, struct task_struct *p, scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } -s32 BPF_STRUCT_OPS(enqueue_select_cpu_fails_init) +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_init) { scx_bpf_switch_all(); @@ -45,10 +42,10 @@ s32 BPF_STRUCT_OPS(enqueue_select_cpu_fails_init) } SEC(".struct_ops.link") -struct sched_ext_ops enqueue_select_cpu_fails_ops = { - .select_cpu = enqueue_select_cpu_fails_select_cpu, - .enqueue = enqueue_select_cpu_fails_enqueue, - .init = enqueue_select_cpu_fails_init, - .name = "enqueue_select_cpu_fails", +struct sched_ext_ops enq_select_cpu_fails_ops = { + .select_cpu = enq_select_cpu_fails_select_cpu, + .enqueue = enq_select_cpu_fails_enqueue, + .init = enq_select_cpu_fails_init, + .name = "enq_select_cpu_fails", .timeout_ms = 1000U, }; diff --git a/tools/testing/selftests/scx/enq_select_cpu_fails.c b/tools/testing/selftests/scx/enq_select_cpu_fails.c new file mode 100644 index 0000000000000..dd1350e5f002d --- /dev/null +++ b/tools/testing/selftests/scx/enq_select_cpu_fails.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu_fails *skel; + + skel = enq_select_cpu_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + sleep(1); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + + enq_select_cpu_fails__destroy(skel); +} + +struct scx_test enq_select_cpu_fails = { + .name = "enq_select_cpu_fails", + .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " + "from ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/scx/enqueue_select_cpu_fails.c b/tools/testing/selftests/scx/enqueue_select_cpu_fails.c deleted file mode 100644 index f45740370f508..0000000000000 --- a/tools/testing/selftests/scx/enqueue_select_cpu_fails.c +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
- * Copyright (c) 2023 David Vernet - * Copyright (c) 2023 Tejun Heo - */ -#include -#include -#include -#include -#include -#include -#include -#include "enqueue_select_cpu_fails.bpf.skel.h" -#include "scx_test.h" - -int main(int argc, char **argv) -{ - struct enqueue_select_cpu_fails *skel; - struct bpf_link *link; - - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - skel = enqueue_select_cpu_fails__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); - - link = bpf_map__attach_struct_ops(skel->maps.enqueue_select_cpu_fails_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); - - sleep(1); - - bpf_link__destroy(link); - enqueue_select_cpu_fails__destroy(skel); - - return 0; -} diff --git a/tools/testing/selftests/scx/init_enable_count.c b/tools/testing/selftests/scx/init_enable_count.c index 413bf065646eb..671e3366e67d2 100644 --- a/tools/testing/selftests/scx/init_enable_count.c +++ b/tools/testing/selftests/scx/init_enable_count.c @@ -31,7 +31,7 @@ open_load_prog(bool global) return skel; } -static void run_test(bool global) +static enum scx_test_status run_test(bool global) { struct init_enable_count *skel; struct bpf_link *link; @@ -42,12 +42,12 @@ static void run_test(bool global) skel = open_load_prog(global); link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); /* SCHED_EXT children */ for (i = 0; i < num_children; i++) { pids[i] = fork(); - SCX_BUG_ON(pids[i] < 0, "Failed to fork child"); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); if (pids[i] == 0) { ret = sched_setscheduler(0, SCHED_EXT, ¶m); @@ -67,10 +67,11 @@ static void run_test(bool global) } } for (i = 0; i < num_children; i++) { - SCX_BUG_ON(waitpid(pids[i], &status, 0) != pids[i], - "Failed to wait for SCX child"); - SCX_BUG_ON(status != 0, "SCX child %d exited with status %d", - i, status); + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for SCX child\n"); + + SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, + status); } /* SCHED_OTHER children */ @@ -79,11 +80,13 @@ static void run_test(bool global) if (pids[i] == 0) exit(0); } + for (i = 0; i < num_children; i++) { - SCX_BUG_ON(waitpid(pids[i], &status, 0) != pids[i], - "Failed to wait for normal child"); - SCX_BUG_ON(status != 0, - "Normal child %d exited with status %d", i, status); + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for normal child\n"); + + SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, + status); } sleep(1); @@ -101,14 +104,25 @@ static void run_test(bool global) bpf_link__destroy(link); init_enable_count__destroy(skel); + + return SCX_TEST_PASS; } -int main(int argc, char **argv) +static enum scx_test_status run(void *ctx) { - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + enum scx_test_status status; - run_test(true); - run_test(false); + status = run_test(true); + if (status != SCX_TEST_PASS) + return status; - return 0; + return run_test(false); } + +struct scx_test init_enable_count = { + .name = "init_enable_count", + .description = "Verify we do the correct amount of counting of init, " + "enable, etc callbacks.", + .run = run, +}; +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/scx/minimal.c b/tools/testing/selftests/scx/minimal.c index 722f0d5023994..6c5db8ebbf8ac 100644 --- a/tools/testing/selftests/scx/minimal.c +++ b/tools/testing/selftests/scx/minimal.c @@ -1,42 +1,58 @@ /* 
SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. - * Copyright (c) 2022 Tejun Heo - * Copyright (c) 2022 David Vernet + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo */ -#include -#include -#include -#include #include #include +#include +#include #include "minimal.bpf.skel.h" +#include "scx_test.h" -static volatile int exit_req; - -static void sigint_handler(int simple) +static enum scx_test_status setup(void **ctx) { - exit_req = 1; + struct minimal *skel; + + skel = minimal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; } -int main(int argc, char **argv) +static enum scx_test_status run(void *ctx) { - struct minimal *skel; + struct minimal *skel = ctx; struct bpf_link *link; - signal(SIGINT, sigint_handler); - signal(SIGTERM, sigint_handler); + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + return SCX_TEST_PASS; +} - skel = minimal__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); +static void cleanup(void *ctx) +{ + struct minimal *skel = ctx; - link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); - sleep(1); - bpf_link__destroy(link); minimal__destroy(skel); - - return 0; } + +struct scx_test minimal = { + .name = "minimal", + .description = "Verify we can load a fully minimal scheduler", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/scx/runner.c b/tools/testing/selftests/scx/runner.c new file mode 100644 index 0000000000000..cfb57f6a00ad5 --- /dev/null +++ b/tools/testing/selftests/scx/runner.c @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include "scx_test.h" + +const char help_fmt[] = +"The runner for sched_ext tests.\n" +"\n" +"The runner is statically linked against all testcases, and runs them all serially.\n" +"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" +"scheduler may be loaded at any given time." 
+"\n" +"Usage: %s [-t TEST] [-h]\n" +"\n" +" -t TEST Only run tests whose name includes this string\n" +" -q Don't print the test descriptions during run\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; +static bool quiet; + +#define MAX_SCX_TESTS 2048 + +static struct scx_test __scx_tests[MAX_SCX_TESTS]; +static unsigned __scx_num_tests = 0; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void print_test_preamble(const struct scx_test *test, bool quiet) +{ + printf("===== START =====\n"); + printf("TEST: %s\n", test->name); + if (!quiet) + printf("DESCRIPTION: %s\n", test->description); + printf("OUTPUT:\n"); +} + +static const char *status_to_result(enum scx_test_status status) +{ + switch (status) { + case SCX_TEST_PASS: + case SCX_TEST_SKIP: + return "ok"; + case SCX_TEST_FAIL: + return "not ok"; + } +} + +static void print_test_result(const struct scx_test *test, + enum scx_test_status status, + unsigned int testnum) +{ + const char *result = status_to_result(status); + const char *directive = status == SCX_TEST_SKIP ? "SKIP " : ""; + + printf("%s %u %s # %s\n", result, testnum, test->name, directive); + printf("===== END =====\n"); +} + +static bool should_skip_test(const struct scx_test *test, const char * filter) +{ + return !strstr(test->name, filter); +} + +static enum scx_test_status run_test(const struct scx_test *test) +{ + enum scx_test_status status; + void *context = NULL; + + if (test->setup) { + status = test->setup(&context); + if (status != SCX_TEST_PASS) + return status; + } + + status = test->run(context); + + if (test->cleanup) + test->cleanup(context); + + return status; +} + +static bool test_valid(const struct scx_test *test) +{ + if (!test) { + fprintf(stderr, "NULL test detected\n"); + return false; + } + + if (!test->name) { + fprintf(stderr, + "Test with no name found. 
Must specify test name.\n"); + return false; + } + + if (!test->description) { + fprintf(stderr, "Test %s requires description.\n", test->name); + return false; + } + + if (!test->run) { + fprintf(stderr, "Test %s has no run() callback\n", test->name); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + const char *filter = NULL; + unsigned testnum = 0, i; + unsigned passed = 0, skipped = 0, failed = 0; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + while ((opt = getopt(argc, argv, "qt:h")) != -1) { + switch (opt) { + case 'q': + quiet = true; + break; + case 't': + filter = optarg; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < __scx_num_tests; i++) { + enum scx_test_status status; + struct scx_test *test = &__scx_tests[i]; + + print_test_preamble(test, quiet); + + if (filter && should_skip_test(test, filter)) { + print_test_result(test, SCX_TEST_SKIP, ++testnum); + continue; + } + + status = run_test(test); + print_test_result(test, status, ++testnum); + switch (status) { + case SCX_TEST_PASS: + passed++; + break; + case SCX_TEST_SKIP: + skipped++; + break; + case SCX_TEST_FAIL: + failed++; + break; + } + } + printf("\n\n=============================\n\n"); + printf("RESULTS:\n\n"); + printf("PASSED: %u\n", passed); + printf("SKIPPED: %u\n", skipped); + printf("FAILED: %u\n", failed); + + return 0; +} + +void scx_test_register(struct scx_test *test) +{ + SCX_BUG_ON(!test_valid(test), "Invalid test found"); + SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); + + __scx_tests[__scx_num_tests++] = *test; +} diff --git a/tools/testing/selftests/scx/scx_test.h b/tools/testing/selftests/scx/scx_test.h index 6a61763b19ab5..e402031c2a84e 100644 --- a/tools/testing/selftests/scx/scx_test.h +++ b/tools/testing/selftests/scx/scx_test.h @@ -8,19 +8,103 @@ #ifndef __SCX_TEST_H__ #define __SCX_TEST_H__ +#include #include -#define SCX_GT(_x, _y) SCX_BUG_ON((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ - #_x, #_y, (u64)(_x), (u64)(_y)) -#define SCX_GE(_x, _y) SCX_BUG_ON((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ - #_x, #_y, (u64)(_x), (u64)(_y)) -#define SCX_LT(_x, _y) SCX_BUG_ON((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ - #_x, #_y, (u64)(_x), (u64)(_y)) -#define SCX_LE(_x, _y) SCX_BUG_ON((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ - #_x, #_y, (u64)(_x), (u64)(_y)) -#define SCX_EQ(_x, _y) SCX_BUG_ON((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ - #_x, #_y, (u64)(_x), (u64)(_y)) -#define SCX_ASSERT(_x) SCX_BUG_ON(!(_x), "Expected %s to be true (%lu)", \ - #_x, (u64)(_x)) +enum scx_test_status { + SCX_TEST_PASS = 0, + SCX_TEST_SKIP, + SCX_TEST_FAIL, +}; + +struct scx_test { + /** + * name - The name of the testcase. + */ + const char *name; + + /** + * description - A description of your testcase: what it tests and is + * meant to validate. + */ + const char *description; + + /* + * setup - Setup the test. + * @ctx: A pointer to a context object that will be passed to run and + * cleanup. + * + * An optional callback that allows a testcase to perform setup for its + * run. A test may return SCX_TEST_SKIP to skip the run. + */ + enum scx_test_status (*setup)(void **ctx); + + /* + * run - Run the test. + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * The main test. 
Callers should return one of: + * + * - SCX_TEST_PASS: Test passed + * - SCX_TEST_SKIP: Test should be skipped + * - SCX_TEST_FAIL: Test failed + * + * This callback must be defined. + */ + enum scx_test_status (*run)(void *ctx); + + /* + * cleanup - Perform cleanup following the test + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * An optional callback that allows a test to perform cleanup after + * being run. This callback is run even if the run() callback returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. + */ + void (*cleanup)(void *ctx); +}; + +void scx_test_register(struct scx_test *test); + +#define REGISTER_SCX_TEST(__test) \ + __attribute__((constructor)) \ + static void ___scxregister##__LINE__(void) \ + { \ + scx_test_register(__test); \ + } + +#define SCX_ERR(__fmt, ...) \ + do { \ + fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, __fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_FAIL(__fmt, ...) \ + do { \ + SCX_ERR(__fmt, ##__VA_ARGS__); \ + return SCX_TEST_FAIL; \ + } while (0) + +#define SCX_FAIL_IF(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_FAIL(__fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ + #_x, (u64)(_x)) #endif // # __SCX_TEST_H__ diff --git a/tools/testing/selftests/scx/test_example.c b/tools/testing/selftests/scx/test_example.c new file mode 100644 index 0000000000000..ce36cdf03cdc5 --- /dev/null +++ b/tools/testing/selftests/scx/test_example.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include "scx_test.h" + +static bool setup_called = false; +static bool run_called = false; +static bool cleanup_called = false; + +static int context = 10; + +static enum scx_test_status setup(void **ctx) +{ + setup_called = true; + *ctx = &context; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + int *arg = ctx; + + SCX_ASSERT(setup_called); + SCX_ASSERT(!run_called && !cleanup_called); + SCX_EQ(*arg, context); + + run_called = true; + return SCX_TEST_PASS; +} + +static void cleanup (void *ctx) +{ + SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); +} + +struct scx_test example = { + .name = "example", + .description = "Validate the basic function of the test suite itself", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&example) From d5061f96f1daf21fbe89723e369c7537860c4159 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Jan 2024 12:25:10 -0600 Subject: [PATCH 255/304] scx: Convert remaining testcases to use new framework Now that the framework has been merged, let's update the remaining testcases to use it. 
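Each conversion follows the same mechanical pattern: the standalone main() is split into the framework's setup()/run()/cleanup() callbacks, fatal SCX_BUG_ON() checks become recoverable SCX_FAIL_IF()/SCX_ASSERT() checks, and the file registers itself with REGISTER_SCX_TEST() instead of returning from main(). A condensed before/after sketch of the select_cpu_dfl conversion from the diffs below, with the fork/assert loop and remaining callbacks elided:

/* Before: one standalone binary per testcase. */
int main(int argc, char **argv)
{
	struct select_cpu_dfl *skel = select_cpu_dfl__open_and_load();

	SCX_BUG_ON(!skel, "Failed to open and load skel");
	/* ... attach, fork children, assert, detach ... */
	select_cpu_dfl__destroy(skel);
	return 0;
}

/* After: callbacks driven by the shared runner binary. */
static enum scx_test_status setup(void **ctx)
{
	struct select_cpu_dfl *skel = select_cpu_dfl__open_and_load();

	SCX_FAIL_IF(!skel, "Failed to open and load skel");
	*ctx = skel;

	return SCX_TEST_PASS;
}
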
Signed-off-by: David Vernet --- tools/testing/selftests/scx/Makefile | 22 +++---- tools/testing/selftests/scx/select_cpu_dfl.c | 46 ++++++++++---- .../selftests/scx/select_cpu_dfl_nodispatch.c | 48 +++++++++----- .../selftests/scx/select_cpu_dispatch.c | 46 +++++++++----- .../scx/select_cpu_dispatch_bad_dsq.c | 63 ++++++++++--------- .../scx/select_cpu_dispatch_dbl_dsp.c | 63 ++++++++++--------- .../testing/selftests/scx/select_cpu_vtime.c | 42 +++++++++---- 7 files changed, 202 insertions(+), 128 deletions(-) diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index c28f23c0442cc..c3bf6c19dccf0 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -147,20 +147,6 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP ################ # C schedulers # ################ -c-sched-targets := \ - select_cpu_dfl \ - select_cpu_dfl_nodispatch \ - select_cpu_dispatch \ - select_cpu_dispatch_bad_dsq \ - select_cpu_dispatch_dbl_dsp \ - select_cpu_vtime - -$(c-sched-targets): %: $(filter-out %.bpf.c,%.c) $(INCLUDE_DIR)/%.bpf.skel.h - $(eval sched=$(notdir $@)) - $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o - $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(LIBBPF_OUTPUT) $(LDFLAGS) - -TEST_GEN_PROGS := $(c-sched-targets) override define CLEAN rm -rf $(OUTPUT_DIR) @@ -176,6 +162,12 @@ auto-test-targets := \ ddsp_vtimelocal_fail \ init_enable_count \ minimal \ + select_cpu_dfl \ + select_cpu_dfl_nodispatch \ + select_cpu_dispatch \ + select_cpu_dispatch_bad_dsq \ + select_cpu_dispatch_dbl_dsp \ + select_cpu_vtime \ test_example testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) @@ -198,6 +190,8 @@ runner: $(SCXOBJ_DIR)/runner.o $(BPFOBJ) $(testcase-targets) @echo "$(testcase-targets)" $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ +TEST_GEN_PROGS := runner + all: runner .PHONY: all clean help diff --git a/tools/testing/selftests/scx/select_cpu_dfl.c b/tools/testing/selftests/scx/select_cpu_dfl.c index 2962be1bec518..a53a40c2d2f0f 100644 --- a/tools/testing/selftests/scx/select_cpu_dfl.c +++ b/tools/testing/selftests/scx/select_cpu_dfl.c @@ -4,32 +4,35 @@ * Copyright (c) 2023 David Vernet * Copyright (c) 2023 Tejun Heo */ -#include -#include -#include -#include #include #include #include +#include #include "select_cpu_dfl.bpf.skel.h" #include "scx_test.h" #define NUM_CHILDREN 1028 -int main(int argc, char **argv) +static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl *skel; + + skel = select_cpu_dfl__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; struct bpf_link *link; pid_t pids[NUM_CHILDREN]; int i, status; - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - skel = select_cpu_dfl__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); for (i = 0; i < NUM_CHILDREN; i++) { pids[i] = fork(); @@ -45,8 +48,25 @@ int main(int argc, char **argv) } SCX_ASSERT(!skel->bss->saw_local); + bpf_link__destroy(link); - select_cpu_dfl__destroy(skel); - return 0; + return SCX_TEST_PASS; } + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + + select_cpu_dfl__destroy(skel); +} + +struct scx_test select_cpu_dfl = { + 
.name = "select_cpu_dfl", + .description = "Verify the default ops.select_cpu() dispatches tasks " + "when idles cores are found, and skips ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl) diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c index 3121b28c81ed0..1d85bf4bf3a39 100644 --- a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c @@ -1,35 +1,38 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. - * Copyright (c) 2023 Tejun Heo * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo */ -#include -#include -#include -#include #include #include #include +#include #include "select_cpu_dfl_nodispatch.bpf.skel.h" #include "scx_test.h" #define NUM_CHILDREN 1028 -int main(int argc, char **argv) +static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl_nodispatch *skel; + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; struct bpf_link *link; pid_t pids[NUM_CHILDREN]; int i, status; - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - skel = select_cpu_dfl_nodispatch__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); for (i = 0; i < NUM_CHILDREN; i++) { pids[i] = fork(); @@ -45,8 +48,25 @@ int main(int argc, char **argv) } SCX_ASSERT(skel->bss->saw_local); + bpf_link__destroy(link); - select_cpu_dfl_nodispatch__destroy(skel); - return 0; + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + + select_cpu_dfl_nodispatch__destroy(skel); } + +struct scx_test select_cpu_dfl_nodispatch = { + .name = "select_cpu_dfl_nodispatch", + .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch.c b/tools/testing/selftests/scx/select_cpu_dispatch.c index a3625f75db720..0309ca8785b36 100644 --- a/tools/testing/selftests/scx/select_cpu_dispatch.c +++ b/tools/testing/selftests/scx/select_cpu_dispatch.c @@ -4,32 +4,35 @@ * Copyright (c) 2023 David Vernet * Copyright (c) 2023 Tejun Heo */ -#include -#include -#include -#include #include #include #include +#include #include "select_cpu_dispatch.bpf.skel.h" #include "scx_test.h" #define NUM_CHILDREN 1028 -int main(int argc, char **argv) +static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch *skel; + + skel = select_cpu_dispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; struct bpf_link *link; pid_t pids[NUM_CHILDREN]; int i, status; - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - skel = select_cpu_dispatch__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); - 
SCX_BUG_ON(!link, "Failed to attach struct_ops"); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); for (i = 0; i < NUM_CHILDREN; i++) { pids[i] = fork(); @@ -44,9 +47,24 @@ int main(int argc, char **argv) SCX_EQ(status, 0); } - bpf_link__destroy(link); - select_cpu_dispatch__destroy(skel); - return 0; + return SCX_TEST_PASS; } + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + + select_cpu_dispatch__destroy(skel); +} + +struct scx_test select_cpu_dispatch = { + .name = "select_cpu_dispatch", + .description = "Test direct dispatching to built-in DSQs from " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c index f1094e3645d61..4b62ff69c203d 100644 --- a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c @@ -4,54 +4,55 @@ * Copyright (c) 2023 David Vernet * Copyright (c) 2023 Tejun Heo */ -#include -#include -#include -#include #include #include #include +#include #include "select_cpu_dispatch_bad_dsq.bpf.skel.h" #include "scx_test.h" -#define NUM_CHILDREN 1028 #define SCX_EXIT_ERROR 1024 -int main(int argc, char **argv) +static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_bad_dsq *skel; - struct bpf_link *link; - pid_t pids[NUM_CHILDREN]; - int i, status; - - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = select_cpu_dispatch_bad_dsq__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; - /* - * The scheduler is expected to gracefully exit after bad_dsqoneously - * double-dispatching from ops.selec_cpu(). 
- */ - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); + return SCX_TEST_PASS; +} - for (i = 0; i < NUM_CHILDREN; i++) { - pids[i] = fork(); - if (pids[i] == 0) { - sleep(1); - exit(0); - } - } +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + struct bpf_link *link; - for (i = 0; i < NUM_CHILDREN; i++) { - SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); - SCX_EQ(status, 0); - } + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); bpf_link__destroy(link); - select_cpu_dispatch_bad_dsq__destroy(skel); - return 0; + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + + select_cpu_dispatch_bad_dsq__destroy(skel); } + +struct scx_test select_cpu_dispatch_bad_dsq = { + .name = "select_cpu_dispatch_bad_dsq", + .description = "Verify graceful failure if we direct-dispatch to a " + "bogus DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c index 9736b65f79bd0..86aa2180f2a1d 100644 --- a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c @@ -4,54 +4,55 @@ * Copyright (c) 2023 David Vernet * Copyright (c) 2023 Tejun Heo */ -#include -#include -#include -#include #include #include #include +#include #include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" #include "scx_test.h" -#define NUM_CHILDREN 1028 #define SCX_EXIT_ERROR 1024 -int main(int argc, char **argv) +static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_dbl_dsp *skel; - struct bpf_link *link; - pid_t pids[NUM_CHILDREN]; - int i, status; - - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = select_cpu_dispatch_dbl_dsp__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; - /* - * The scheduler is expected to gracefully exit after - * double-dispatching from ops.select_cpu(). 
- */ - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); + return SCX_TEST_PASS; +} - for (i = 0; i < NUM_CHILDREN; i++) { - pids[i] = fork(); - if (pids[i] == 0) { - sleep(1); - exit(0); - } - } +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + struct bpf_link *link; - for (i = 0; i < NUM_CHILDREN; i++) { - SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); - SCX_EQ(status, 0); - } + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); bpf_link__destroy(link); - select_cpu_dispatch_dbl_dsp__destroy(skel); - return 0; + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + + select_cpu_dispatch_dbl_dsp__destroy(skel); } + +struct scx_test select_cpu_dispatch_dbl_dsp = { + .name = "select_cpu_dispatch_dbl_dsp", + .description = "Verify graceful failure if we dispatch twice to a " + "DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) diff --git a/tools/testing/selftests/scx/select_cpu_vtime.c b/tools/testing/selftests/scx/select_cpu_vtime.c index 6f72f0625478c..b4629c2364f5d 100644 --- a/tools/testing/selftests/scx/select_cpu_vtime.c +++ b/tools/testing/selftests/scx/select_cpu_vtime.c @@ -4,36 +4,56 @@ * Copyright (c) 2024 David Vernet * Copyright (c) 2024 Tejun Heo */ -#include -#include -#include #include #include #include +#include #include "select_cpu_vtime.bpf.skel.h" #include "scx_test.h" -int main(int argc, char **argv) +static enum scx_test_status setup(void **ctx) { struct select_cpu_vtime *skel; - struct bpf_link *link; - - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = select_cpu_vtime__open_and_load(); - SCX_BUG_ON(!skel, "Failed to open and load skel"); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + struct bpf_link *link; SCX_ASSERT(!skel->bss->consumed); link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); - SCX_BUG_ON(!link, "Failed to attach struct_ops"); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); sleep(1); SCX_ASSERT(skel->bss->consumed); bpf_link__destroy(link); - select_cpu_vtime__destroy(skel); - return 0; + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + + select_cpu_vtime__destroy(skel); } + +struct scx_test select_cpu_vtime = { + .name = "select_cpu_vtime", + .description = "Test doing direct vtime-dispatching from " + "ops.select_cpu(), to a non-built-in DSQ", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_vtime) From 1fa672f51a29a0cc3596992720aecd9b5352b361 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Jan 2024 12:29:02 -0600 Subject: [PATCH 256/304] scx: Update ddsp testcases to check for error exits We're checking that we don't crash when we encounter these error conditions, but let's also test that we exit with the expected error condition. The next patch will update this to be built into the test framework. 
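For reference, the added check is a single assertion against the exit kind that the BPF scheduler records through its exit callback; a minimal sketch of the pattern used by both testcases (the foo_* names are placeholders, and the scheduler is assumed to export a struct user_exit_info named uei in its BSS, as these schedulers do):

	/* BPF side: remember why the scheduler exited */
	void BPF_STRUCT_OPS(foo_exit, struct scx_exit_info *ei)
	{
		uei_record(&uei, ei);
	}

	/* userspace side: provoke the error, then check the recorded kind */
	sleep(1);
	SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR);
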
Signed-off-by: David Vernet --- tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c | 8 ++++++++ tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c | 4 ++++ tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c | 8 ++++++++ tools/testing/selftests/scx/ddsp_vtimelocal_fail.c | 4 ++++ 4 files changed, 24 insertions(+) diff --git a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c index 78bd8feace050..dd32b189911ef 100644 --- a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c +++ b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.bpf.c @@ -8,6 +8,8 @@ char _license[] SEC("license") = "GPL"; +struct user_exit_info uei; + s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -26,6 +28,11 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, return prev_cpu; } +void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_init) { scx_bpf_switch_all(); @@ -36,6 +43,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_init) SEC(".struct_ops.link") struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .exit = ddsp_bogus_dsq_fail_exit, .init = ddsp_bogus_dsq_fail_init, .name = "ddsp_bogus_dsq_fail", .timeout_ms = 1000U, diff --git a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c index b12e804a0b664..0310fa58f31be 100644 --- a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c +++ b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c @@ -11,6 +11,8 @@ #include "ddsp_bogus_dsq_fail.bpf.skel.h" #include "scx_test.h" +#define SCX_EXIT_ERROR 1024 + static enum scx_test_status setup(void **ctx) { struct ddsp_bogus_dsq_fail *skel; @@ -31,6 +33,8 @@ static enum scx_test_status run(void *ctx) SCX_FAIL_IF(!link, "Failed to attach struct_ops"); sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); bpf_link__destroy(link); return SCX_TEST_PASS; diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c index 14a40be192c5d..9b21c1d57861c 100644 --- a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c @@ -8,6 +8,8 @@ char _license[] SEC("license") = "GPL"; +struct user_exit_info uei; + s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -23,6 +25,11 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, return prev_cpu; } +void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_init) { scx_bpf_switch_all(); @@ -34,6 +41,7 @@ SEC(".struct_ops.link") struct sched_ext_ops ddsp_vtimelocal_fail_ops = { .select_cpu = ddsp_vtimelocal_fail_select_cpu, .init = ddsp_vtimelocal_fail_init, + .exit = ddsp_vtimelocal_fail_exit, .name = "ddsp_vtimelocal_fail", .timeout_ms = 1000U, }; diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c index 03bbadfaaf2f1..211e84886b4d6 100644 --- a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c @@ -10,6 +10,8 @@ #include "ddsp_vtimelocal_fail.bpf.skel.h" #include "scx_test.h" +#define SCX_EXIT_ERROR 1024 + static enum scx_test_status setup(void **ctx) { struct 
ddsp_vtimelocal_fail *skel; @@ -30,6 +32,8 @@ static enum scx_test_status run(void *ctx) SCX_FAIL_IF(!link, "Failed to attach struct_ops"); sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); bpf_link__destroy(link); return SCX_TEST_PASS; From 8d7a79e80117059d1912d88dd8c551cbd830ce7b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Jan 2024 12:39:33 -0600 Subject: [PATCH 257/304] scx: Copy scx_exit_kind to scx_test.h Rather than define the error value in each test, let's just define it in scx_test.h. Signed-off-by: David Vernet --- tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c | 2 -- tools/testing/selftests/scx/ddsp_vtimelocal_fail.c | 2 -- tools/testing/selftests/scx/scx_test.h | 13 +++++++++++++ .../selftests/scx/select_cpu_dispatch_bad_dsq.c | 2 -- .../selftests/scx/select_cpu_dispatch_dbl_dsp.c | 2 -- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c index 0310fa58f31be..ef8ee04ff9871 100644 --- a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c +++ b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c @@ -11,8 +11,6 @@ #include "ddsp_bogus_dsq_fail.bpf.skel.h" #include "scx_test.h" -#define SCX_EXIT_ERROR 1024 - static enum scx_test_status setup(void **ctx) { struct ddsp_bogus_dsq_fail *skel; diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c index 211e84886b4d6..b55611cd0b1fb 100644 --- a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c @@ -10,8 +10,6 @@ #include "ddsp_vtimelocal_fail.bpf.skel.h" #include "scx_test.h" -#define SCX_EXIT_ERROR 1024 - static enum scx_test_status setup(void **ctx) { struct ddsp_vtimelocal_fail *skel; diff --git a/tools/testing/selftests/scx/scx_test.h b/tools/testing/selftests/scx/scx_test.h index e402031c2a84e..4b70bf75fa814 100644 --- a/tools/testing/selftests/scx/scx_test.h +++ b/tools/testing/selftests/scx/scx_test.h @@ -17,6 +17,19 @@ enum scx_test_status { SCX_TEST_FAIL, }; +/* Copied from include/linux/sched/ext.h */ +enum scx_test_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* BPF unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + struct scx_test { /** * name - The name of the testcase. 
diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c index 4b62ff69c203d..a7b91d58cb318 100644 --- a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c @@ -11,8 +11,6 @@ #include "select_cpu_dispatch_bad_dsq.bpf.skel.h" #include "scx_test.h" -#define SCX_EXIT_ERROR 1024 - static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_bad_dsq *skel; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c index 86aa2180f2a1d..e32b229637448 100644 --- a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c @@ -11,8 +11,6 @@ #include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" #include "scx_test.h" -#define SCX_EXIT_ERROR 1024 - static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_dbl_dsp *skel; From 4bbb07ccaa10a13ad5ea8ff0d3fe45ffaaa831dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Jan 2024 13:47:22 -1000 Subject: [PATCH 258/304] scx: Narrow cpus_read_lock() critical section in scx_ops_enable() cpus_read_lock() is needed for two purposes in scx_ops_enable(). First, to keep CPUs stable between ops.init() and enabling of ops.cpu_on/offline(). Second, to work around the locking order issue between scx_cgroup_rwsem and cpu_hotplug_lock caused by static_branch_*(). Currently, scx_ops_enable() acquires cpus_read_lock() and holds it through most of ops enabling covering both use cases. This makes it difficult to understand what lock is held where and resolve locking order issues among these system-wide locks. Let's separate out the two sections so that ops.init() and ops.cpu_on/offline() enabling are contained in its own critical section and cpus_read_lock() is droped and then reacquired for the second use case. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 42 +++++++++++++++++-------------- kernel/sched/ext.c | 53 ++++++++++++++++++++++++--------------- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e629686cf0621..ae552129931a9 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -412,24 +412,6 @@ struct sched_ext_ops { */ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - /** - * cpu_online - A CPU became online - * @cpu: CPU which just came up - * - * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks - * associated with other CPUs beforehand. - */ - void (*cpu_online)(s32 cpu); - - /** - * cpu_offline - A CPU is going offline - * @cpu: CPU which is going offline - * - * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks - * associated with other CPUs afterwards. - */ - void (*cpu_offline)(s32 cpu); - /** * init_task - Initialize a task to run in a BPF scheduler * @p: task to initialize for BPF scheduling @@ -550,7 +532,29 @@ struct sched_ext_ops { #endif /* CONFIG_CGROUPS */ /* - * All online ops must come before ops.init(). + * All online ops must come before ops.cpu_online(). + */ + + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs beforehand. 
+ */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). */ /** diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dd09b53254f59..f8889a82267ae 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,10 +9,15 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_internal_consts { - SCX_NR_ONLINE_OPS = SCX_OP_IDX(init), - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, }; enum scx_ops_enable_state { @@ -104,8 +109,8 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); -struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = - { [0 ... SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT }; +struct static_key_false scx_has_op[SCX_OPI_END] = + { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info scx_exit_info; @@ -3228,7 +3233,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) /* no task is on scx, turn off all the switches and flush in-progress calls */ static_branch_disable_cpuslocked(&__scx_ops_enabled); - for (i = 0; i < SCX_NR_ONLINE_OPS; i++) + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) static_branch_disable_cpuslocked(&scx_has_op[i]); static_branch_disable_cpuslocked(&scx_ops_enq_last); static_branch_disable_cpuslocked(&scx_ops_enq_exiting); @@ -3373,13 +3378,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops) scx_create_rt_helper("sched_ext_ops_helper")); if (!scx_ops_helper) { ret = -ENOMEM; - goto err_unlock; + goto err; } } if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ret = -EBUSY; - goto err_unlock; + goto err; } /* @@ -3408,7 +3413,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) ret = SCX_CALL_OP_RET(SCX_KF_INIT, init); if (ret) { ret = ops_sanitize_err("init", ret); - goto err_disable; + goto err_disable_unlock_cpus; } /* @@ -3420,9 +3425,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops) * ops.exit() like other scx_bpf_error() invocations. */ if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE) - goto err_disable; + goto err_disable_unlock_cpus; } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + cpus_read_unlock(); + ret = validate_ops(ops); if (ret) goto err_disable; @@ -3450,10 +3461,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops) * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. 
*/ + cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); scx_cgroup_lock(); - for (i = 0; i < SCX_NR_ONLINE_OPS; i++) + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -3478,7 +3490,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) */ ret = scx_cgroup_init(); if (ret) - goto err_disable_unlock; + goto err_disable_unlock_all; static_branch_enable_cpuslocked(&__scx_ops_enabled); @@ -3504,7 +3516,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", ret, p->comm, p->pid); - goto err_disable_unlock; + goto err_disable_unlock_all; } put_task_struct(p); @@ -3528,7 +3540,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) preempt_enable(); spin_unlock_irq(&scx_tasks_lock); ret = -EBUSY; - goto err_disable_unlock; + goto err_disable_unlock_all; } /* @@ -3564,6 +3576,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) preempt_enable(); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + cpus_read_unlock(); if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { ret = -EBUSY; @@ -3571,24 +3584,24 @@ static int scx_ops_enable(struct sched_ext_ops *ops) } if (scx_switch_all_req) - static_branch_enable_cpuslocked(&__scx_switched_all); + static_branch_enable(&__scx_switched_all); - cpus_read_unlock(); mutex_unlock(&scx_ops_enable_mutex); scx_cgroup_config_knobs(); return 0; -err_unlock: +err: mutex_unlock(&scx_ops_enable_mutex); return ret; -err_disable_unlock: +err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); -err_disable: +err_disable_unlock_cpus: cpus_read_unlock(); +err_disable: mutex_unlock(&scx_ops_enable_mutex); /* must be fully disabled before returning */ scx_ops_disable(SCX_EXIT_ERROR); From 1225a9069ff2185007b078b7e447727d229a64fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Jan 2024 13:47:22 -1000 Subject: [PATCH 259/304] scx: Reorder scx_fork_rwsem, cpu_hotplug_lock and scx_cgroup_rwsem scx_cgroup_rwsem and scx_fork_rwsem, respectively, are in the following locking dependency chain. cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock And we need to flip static_key which requires CPUs stable. The only locking order which satifies all three requirements is scx_fork_rwsem --> cpu_hotplug_lock --> scx_cgroup_rwsem Reorder locking in scx_ops_enable() and scx_ops_disable_workfn(). Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f8889a82267ae..4ae373743911a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3201,9 +3201,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); - /* avoid racing against fork and cgroup changes */ - cpus_read_lock(); + /* + * Avoid racing against fork and cgroup changes. See scx_ops_enable() + * for explanation on the locking order. 
+ */ percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); scx_cgroup_lock(); spin_lock_irq(&scx_tasks_lock); @@ -3244,8 +3247,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_cgroup_exit(); scx_cgroup_unlock(); - percpu_up_write(&scx_fork_rwsem); cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); if (ei->kind >= SCX_EXIT_ERROR) { printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); @@ -3460,9 +3463,23 @@ static int scx_ops_enable(struct sched_ext_ops *ops) /* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. + * + * We don't need to keep the CPUs stable but static_branch_*() requires + * cpus_read_lock() and scx_cgroup_rwsem must nest inside + * cpu_hotplug_lock because of the following dependency chain: + * + * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem + * + * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use + * static_branch_*_cpuslocked(). + * + * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the + * following dependency chain: + * + * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock */ - cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); scx_cgroup_lock(); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) @@ -3575,8 +3592,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); preempt_enable(); scx_cgroup_unlock(); - percpu_up_write(&scx_fork_rwsem); cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { ret = -EBUSY; From dfb12104e2ca6672c7be20913875082487e6eaf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Jan 2024 14:05:14 -1000 Subject: [PATCH 260/304] scx: Sync from scx repo b32d73ae4e19 ("Merge pull request #82 from sched-ext/htejun") --- tools/sched_ext/scx_flatcg.bpf.c | 92 +++++++++++++++++++++----------- tools/sched_ext/scx_flatcg.c | 24 +++++---- tools/sched_ext/scx_flatcg.h | 2 + 3 files changed, 77 insertions(+), 41 deletions(-) diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 869115805b288..d6a947bc98151 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -46,6 +46,11 @@ #include #include "scx_flatcg.h" +/* + * Maximum amount of retries to find a valid cgroup. + */ +#define CGROUP_MAX_RETRIES 1024 + char _license[] SEC("license") = "GPL"; const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ @@ -302,6 +307,17 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) bpf_spin_unlock(&cgv_tree_lock); } +static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) +{ + /* + * Tell fcg_stopping() that this bypassed the regular scheduling path + * and should be force charged to the cgroup. 0 is used to indicate that + * the task isn't bypassing, so if the current runtime is 0, go back by + * one nanosecond. + */ + taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; +} + s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { struct fcg_task_ctx *taskc; @@ -319,35 +335,12 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake /* * If select_cpu_dfl() is recommending local enqueue, the target CPU is * idle. Follow it and charge the cgroup later in fcg_stopping() after - * the fact. 
Use the same mechanism to deal with tasks with custom - * affinities so that we don't have to worry about per-cgroup dq's - * containing tasks that can't be executed from some CPUs. + * the fact. */ - if (is_idle || p->nr_cpus_allowed != nr_cpus) { - /* - * Tell fcg_stopping() that this bypassed the regular scheduling - * path and should be force charged to the cgroup. 0 is used to - * indicate that the task isn't bypassing, so if the current - * runtime is 0, go back by one nanosecond. - */ - taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; - - /* - * The global dq is deprioritized as we don't want to let tasks - * to boost themselves by constraining its cpumask. The - * deprioritization is rather severe, so let's not apply that to - * per-cpu kernel threads. This is ham-fisted. We probably wanna - * implement per-cgroup fallback dq's instead so that we have - * more control over when tasks with custom cpumask get issued. - */ - if (is_idle || - (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) { - stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); - } else { - stat_inc(FCG_STAT_GLOBAL); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); - } + if (is_idle) { + set_bypassed_at(p, taskc); + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } return cpu; @@ -365,6 +358,32 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* + * Use the direct dispatching and force charging to deal with tasks with + * custom affinities so that we don't have to worry about per-cgroup + * dq's containing tasks that can't be executed from some CPUs. + */ + if (p->nr_cpus_allowed != nr_cpus) { + set_bypassed_at(p, taskc); + + /* + * The global dq is deprioritized as we don't want to let tasks + * to boost themselves by constraining its cpumask. The + * deprioritization is rather severe, so let's not apply that to + * per-cpu kernel threads. This is ham-fisted. We probably wanna + * implement per-cgroup fallback dq's instead so that we have + * more control over when tasks with custom cpumask get issued. + */ + if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + } else { + stat_inc(FCG_STAT_GLOBAL); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + return; + } + cgrp = scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (!cgc) @@ -691,6 +710,7 @@ static bool try_pick_next_cgroup(u64 *cgidp) bpf_spin_lock(&cgv_tree_lock); bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_RACE); } else { cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); if (cgv_node) { @@ -712,6 +732,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) struct fcg_cgrp_ctx *cgc; struct cgroup *cgrp; u64 now = bpf_ktime_get_ns(); + bool picked_next = false; cpuc = find_cpu_ctx(); if (!cpuc) @@ -766,10 +787,21 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) return; } - bpf_repeat(BPF_MAX_LOOPS) { - if (try_pick_next_cgroup(&cpuc->cur_cgid)) + bpf_repeat(CGROUP_MAX_RETRIES) { + if (try_pick_next_cgroup(&cpuc->cur_cgid)) { + picked_next = true; break; + } } + + /* + * This only happens if try_pick_next_cgroup() races against enqueue + * path for more than CGROUP_MAX_RETRIES times, which is extremely + * unlikely and likely indicates an underlying bug. 
There shouldn't be + * any stall risk as the race is against enqueue. + */ + if (!picked_next) + stat_inc(FCG_STAT_PNC_FAIL); } s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index b326b2d3ec350..6c2f9715f6925 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -186,29 +186,31 @@ int main(int argc, char **argv) printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", seq++, cpu_util * 100.0, skel->data->hweight_gen); - printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n", + printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", stats[FCG_STAT_ACT], stats[FCG_STAT_DEACT], - stats[FCG_STAT_LOCAL], - stats[FCG_STAT_GLOBAL]); - printf("HWT skip:%6llu race:%6llu cache:%6llu update:%6llu\n", - stats[FCG_STAT_HWT_SKIP], - stats[FCG_STAT_HWT_RACE], + stats[FCG_STAT_GLOBAL], + stats[FCG_STAT_LOCAL]); + printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", stats[FCG_STAT_HWT_CACHE], - stats[FCG_STAT_HWT_UPDATES]); + stats[FCG_STAT_HWT_UPDATES], + stats[FCG_STAT_HWT_SKIP], + stats[FCG_STAT_HWT_RACE]); printf("ENQ skip:%6llu race:%6llu\n", stats[FCG_STAT_ENQ_SKIP], stats[FCG_STAT_ENQ_RACE]); - printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", + printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", stats[FCG_STAT_CNS_KEEP], stats[FCG_STAT_CNS_EXPIRE], stats[FCG_STAT_CNS_EMPTY], stats[FCG_STAT_CNS_GONE]); - printf("PNC nocgrp:%6llu next:%6llu empty:%6llu gone:%6llu\n", - stats[FCG_STAT_PNC_NO_CGRP], + printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", stats[FCG_STAT_PNC_NEXT], stats[FCG_STAT_PNC_EMPTY], - stats[FCG_STAT_PNC_GONE]); + stats[FCG_STAT_PNC_NO_CGRP], + stats[FCG_STAT_PNC_GONE], + stats[FCG_STAT_PNC_RACE], + stats[FCG_STAT_PNC_FAIL]); printf("BAD remove:%6llu\n", acc_stats[FCG_STAT_BAD_REMOVAL]); fflush(stdout); diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h index 490758ed41f0f..6f2ea50acb1cb 100644 --- a/tools/sched_ext/scx_flatcg.h +++ b/tools/sched_ext/scx_flatcg.h @@ -28,6 +28,8 @@ enum fcg_stat_idx { FCG_STAT_PNC_NEXT, FCG_STAT_PNC_EMPTY, FCG_STAT_PNC_GONE, + FCG_STAT_PNC_RACE, + FCG_STAT_PNC_FAIL, FCG_STAT_BAD_REMOVAL, From 74cdbb0a69905c7d994ce72d26eb5d84c3031030 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 9 Jan 2024 10:47:33 +0100 Subject: [PATCH 261/304] ci: add github workflow to test the sched-ext kernel Add a github action to test the sched-ext kernel with all the shipped schedulers. The test uses a similar approach to the scx workflow [1], using virtme-ng to run each scheduler inside a sched-ext enabled kernel for a certain amount of time (30 sec) and checking for potential stall, oops or bug conditions. In this case we can use `virtme-ng --build` to build a kernel with bare minimum support to run inside virtme-ng itself, instead of generating a fully featured kernel, to expedite the testing process. The mandatory .config options required by sched-ext are stored in `.github/workflows/sched-ext.config` and they are passed to virtme-ng via the `--config` option. The test itself is defined in `.github/workflows/run-schedulers`: the script looks for all the binaries in `tools/sched_ext/build/bin` and runs each one in a separate virtme-ng instance, to ensure that each run does not impact the others. 
[1] https://github.com/sched-ext/scx/blob/main/.github/workflows/build-scheds.yml Signed-off-by: Andrea Righi --- .github/workflows/run-schedulers | 51 ++++++++++++++++++++++++++++++ .github/workflows/sched-ext.config | 34 ++++++++++++++++++++ .github/workflows/test-kernel.yml | 47 +++++++++++++++++++++++++++ .gitignore | 3 ++ 4 files changed, 135 insertions(+) create mode 100755 .github/workflows/run-schedulers create mode 100644 .github/workflows/sched-ext.config create mode 100644 .github/workflows/test-kernel.yml diff --git a/.github/workflows/run-schedulers b/.github/workflows/run-schedulers new file mode 100755 index 0000000000000..0aa3ffea3bff7 --- /dev/null +++ b/.github/workflows/run-schedulers @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Run sched-ext scheduler for TIMEOUT seconds inside virtme-ng and catch +# potential errors, then unload the scheduler and return the exit status. + +# Maximum time for each scheduler run. +TEST_TIMEOUT=30 + +# Maximum timeout for the guest used for each scheduler run (this is used to +# hard-shutdown the guest in case of system hangs). +GUEST_TIMEOUT=60 + +# Check if virtme-ng is available. +if [ ! -x `which vng` ]; then + echo "vng not found, please install virtme-ng to enable testing" + exit 1 +fi + +# Test all the available schedulers. +# +# NOTE: virtme-ng automatically runs the kernel from the current working +# directory by default. +# +# Each scheduler will be tested in a separate instance booted from scratch, to +# ensure that each run does not impact the others. +# +# TODO: exclude scx_layered for now, because it requires a special config +# file, otherwise its test would fail with "Error: No layer spec". +# +# Maybe in the future change scx_layered to run with a default layer spec, just +# for testing it. +# +for sched in $(find tools/sched_ext/build/bin -type f -executable | grep -v scx_layered); do + rm -f /tmp/output + (timeout --foreground --preserve-status ${GUEST_TIMEOUT} \ + vng --force-9p --disable-microvm --verbose -- \ + "timeout --foreground --preserve-status ${TEST_TIMEOUT} ${sched}" \ + 2>&1 Date: Tue, 16 Jan 2024 17:53:19 -0800 Subject: [PATCH 262/304] scx: Make the pointer passing to .dispatch MAYBE_NULL. The struct task_struct pointer passing to .dispatch can be NULL. However, we assume that the pointers passing to a struct_ops programs are always trusted (PTR_TRUSTED), that means it is always valid (not NULL). It makes the verifier fail to validate programs, and may cause a kernel crash when running these programs. This patch marks the second argument of .dispatch with PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED in bpf_scx_is_valid_access(). The verifier will ensures the programs always check if the argument is NULL before reading the pointed memory. --- kernel/sched/ext.c | 65 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae373743911a..ac813e870999f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3666,16 +3666,76 @@ const struct file_operations sched_ext_fops = { extern struct btf *btf_vmlinux; static const struct btf_type *task_struct_type; +static u32 task_struct_type_id; + +/* Make the 2nd argument of .dispatch a pointer that can be NULL. 
*/ +static bool promote_dispatch_2nd_arg(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const struct bpf_struct_ops *st_ops; + const struct btf_member *member; + const struct btf_type *t; + u32 btf_id, member_idx; + const char *mname; + + /* btf_id should be the type id of struct sched_ext_ops */ + btf_id = prog->aux->attach_btf_id; + st_ops = bpf_struct_ops_find(btf_id); + if (!st_ops) + return false; + + /* BTF type of struct sched_ext_ops */ + t = st_ops->type; + + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(t)) + return false; + + /* Get the member name of this program. For example, the + * member name of the dispatch program is "dispatch". + */ + member = &btf_type_member(t)[member_idx]; + mname = btf_name_by_offset(btf_vmlinux, member->name_off); + + /* Chkeck if it is the 2nd argument of the function pointer at + * "dispatch" in struct sched_ext_ops. The arguments of + * struct_ops operators are placed in the context one after + * another. And, they are 64-bits. So, the 2nd argument is at + * offset sizeof(__u64). + */ + if (strcmp(mname, "dispatch") == 0 && + off == sizeof(__u64)) { + /* The value is a pointer to a type (struct + * task_struct) given by a BTF ID (PTR_TO_BTF_ID). It + * is tursted (PTR_TRUSTED), however, can be a NULL + * (PTR_MAYBE_NULL). The BPF program should check the + * pointer to make sure it is not null before using + * it, or the verifier will reject the program. + */ + info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | + PTR_TRUSTED; + info->btf = btf_vmlinux; + info->btf_id = task_struct_type_id; + + return true; + } + + return false; +} static bool bpf_scx_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) - return false; if (type != BPF_READ) return false; + if (promote_dispatch_2nd_arg(off, size, type, prog, info)) + return true; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; if (off % size != 0) return false; @@ -3804,6 +3864,7 @@ static int bpf_scx_init(struct btf *btf) if (type_id < 0) return -EINVAL; task_struct_type = btf_type_by_id(btf, type_id); + task_struct_type_id = type_id; return 0; } From b21b258aa00bb42f13e4ae33d6ca817c6b3cb3cc Mon Sep 17 00:00:00 2001 From: Thinker Lee Date: Tue, 16 Jan 2024 18:05:12 -0800 Subject: [PATCH 263/304] selftests/scx: Check if MAYBE_NULL works for the 2nd argument of .dispatch. Check if the verifier can catch the invalid access if a .dispatch program doesn't check the 2nd argument before accessing the pointed memory. Also check if the verifier allows a program which check the 2nd argument before accessing the pointed memory. 
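The difference between the two programs is only whether the task pointer is NULL-checked before it is dereferenced; a minimal sketch mirroring the maybe_null and maybe_null_fail dispatch callbacks added below (names shortened for illustration):

	/* loads fine: p is PTR_MAYBE_NULL and is checked before use */
	void BPF_STRUCT_OPS(dispatch_ok, s32 cpu, struct task_struct *p)
	{
		if (p != NULL)
			vtime_test = p->scx.dsq_vtime;
	}

	/* rejected by the verifier: dereferences p without a NULL check */
	void BPF_STRUCT_OPS(dispatch_fail, s32 cpu, struct task_struct *p)
	{
		vtime_test = p->scx.dsq_vtime;
	}
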
--- tools/testing/selftests/scx/Makefile | 5 ++- tools/testing/selftests/scx/maybe_null.bpf.c | 26 ++++++++++++ tools/testing/selftests/scx/maybe_null.c | 42 +++++++++++++++++++ .../selftests/scx/maybe_null_fail.bpf.c | 25 +++++++++++ tools/testing/selftests/scx/runner.c | 2 + 5 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/scx/maybe_null.bpf.c create mode 100644 tools/testing/selftests/scx/maybe_null.c create mode 100644 tools/testing/selftests/scx/maybe_null_fail.bpf.c diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index c3bf6c19dccf0..8d10a11c3751a 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -161,6 +161,7 @@ auto-test-targets := \ ddsp_bogus_dsq_fail \ ddsp_vtimelocal_fail \ init_enable_count \ + maybe_null \ minimal \ select_cpu_dfl \ select_cpu_dfl_nodispatch \ @@ -175,6 +176,8 @@ testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-target $(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) $(CC) $(CFLAGS) -c $< -o $@ +$(SCXOBJ_DIR)/maybe_null.o: $(INCLUDE_DIR)/maybe_null_fail.bpf.skel.h + # Create all of the test targets object files, whose testcase objects will be # registered into the runner in ELF constructors. # @@ -188,7 +191,7 @@ $(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $$(if $$(wild runner: $(SCXOBJ_DIR)/runner.o $(BPFOBJ) $(testcase-targets) @echo "$(testcase-targets)" - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) TEST_GEN_PROGS := runner diff --git a/tools/testing/selftests/scx/maybe_null.bpf.c b/tools/testing/selftests/scx/maybe_null.bpf.c new file mode 100644 index 0000000000000..d45a77281c053 --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null.bpf.c @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) +{ + if (p != NULL) + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_success = { + .dispatch = maybe_null_success_dispatch, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/maybe_null.c b/tools/testing/selftests/scx/maybe_null.c new file mode 100644 index 0000000000000..578820d21685b --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "maybe_null.bpf.skel.h" +#include "maybe_null_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status run(void *ctx) +{ + struct maybe_null *skel; + struct maybe_null_fail *fail_skel; + + skel = maybe_null__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load maybe_null skel"); + return SCX_TEST_FAIL; + } + maybe_null__destroy(skel); + + fail_skel = maybe_null_fail__open_and_load(); + if (fail_skel) { + maybe_null_fail__destroy(fail_skel); + SCX_ERR("Should failed to open and load maybe_null_fail skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +struct scx_test maybe_null = { + .name = "maybe_null", + .description = "Verify if PTR_MAYBE_NULL work for .dispatch", + .run = run, +}; +REGISTER_SCX_TEST(&maybe_null) diff --git a/tools/testing/selftests/scx/maybe_null_fail.bpf.c b/tools/testing/selftests/scx/maybe_null_fail.bpf.c new file mode 100644 index 0000000000000..c09a5b7ecf20f --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null_fail.bpf.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) +{ + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_fail = { + .dispatch = maybe_null_fail_dispatch, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/runner.c b/tools/testing/selftests/scx/runner.c index cfb57f6a00ad5..17262c30b96de 100644 --- a/tools/testing/selftests/scx/runner.c +++ b/tools/testing/selftests/scx/runner.c @@ -55,6 +55,8 @@ static const char *status_to_result(enum scx_test_status status) case SCX_TEST_FAIL: return "not ok"; } + + return NULL; } static void print_test_result(const struct scx_test *test, From e7a7781b485d4d8989e7312b69bb04a53ad495a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Jan 2024 07:58:53 -1000 Subject: [PATCH 264/304] scx: Add /sys/kernel/sched_ext interface /sys/kernel/debug/sched/ext is the current interface file which can be used to determine the current state of scx. This is problematic in that it's dependent on CONFIG_SCHED_DEBUG. On kernels which don't have the option enabled, there is no easy way to tell whether scx is currently in use. Let's add a new kobject based interface which is created under /sys/kernel/sched_ext. The directory contains: - System level interface files. As it's now a non-debug interface, confine the exposed files to "state", "switch_all" and "nr_rejected". - Per-scheduler directory which currently only contains "ops". The directory is always named "root" for now. This is in preparation of the future where there can be multiple schedulers loaded in a system. Loading and unloading of a scheduler also generates a uevent with SCXOPS attribute. 
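From userspace the new files are ordinary sysfs reads; a hypothetical snippet (not part of this patch) that prints the current state using the path described above:

	#include <stdio.h>

	static void print_scx_state(void)
	{
		char buf[64] = {};
		FILE *f = fopen("/sys/kernel/sched_ext/state", "r");

		if (f && fgets(buf, sizeof(buf), f))
			printf("sched_ext state: %s", buf);
		if (f)
			fclose(f);
	}
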
Signed-off-by: Tejun Heo --- Documentation/scheduler/sched-ext.rst | 16 ++-- kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 124 +++++++++++++++++++++++++- 3 files changed, 130 insertions(+), 11 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 3e1e0a4e974d7..b49f3f94a6cba 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -51,17 +51,17 @@ BPF scheduler and reverts all tasks back to CFS. local=17 global=72 ^CEXIT: BPF scheduler unregistered -If ``CONFIG_SCHED_DEBUG`` is set, the current status of the BPF scheduler -and whether a given task is on sched_ext can be determined as follows: +The current status of the BPF scheduler can be determined as follows: .. code-block:: none - # cat /sys/kernel/debug/sched/ext - ops : simple - enabled : 1 - switching_all : 1 - switched_all : 1 - enable_state : enabled + # cat /sys/kernel/sched_ext/state + enabled + # cat /sys/kernel/sched_ext/root/ops + simple + +If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can +be determined as follows: # grep ext /proc/self/sched ext.enabled : 1 diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 96ea08f76603a..392c91667767d 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae373743911a..65c1bdf5b9637 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -193,6 +193,10 @@ struct scx_dsp_ctx { static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); +/* /sys/kernel/sched_ext interface */ +static struct kset *scx_kset; +static struct kobject *scx_root_kobj; + void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags); void scx_bpf_kick_cpu(s32 cpu, u64 flags); @@ -3030,6 +3034,83 @@ static int scx_cgroup_init(void) { return 0; } static void scx_cgroup_config_knobs(void) {} #endif + +/******************************************************************************** + * Sysfs interface and ops enable/disable. 
+ */ + +#define SCX_ATTR(_name) \ + static struct kobj_attribute scx_attr_##_name = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = scx_attr_##_name##_show, \ + } + +static ssize_t scx_attr_state_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", + scx_ops_enable_state_str[scx_ops_enable_state()]); +} +SCX_ATTR(state); + +static ssize_t scx_attr_switch_all_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); +} +SCX_ATTR(switch_all); + +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + +static struct attribute *scx_global_attrs[] = { + &scx_attr_state.attr, + &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, + NULL, +}; + +static const struct attribute_group scx_global_attr_group = { + .attrs = scx_global_attrs, +}; + +static void scx_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static ssize_t scx_attr_ops_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", scx_ops.name); +} +SCX_ATTR(ops); + +static struct attribute *scx_sched_attrs[] = { + &scx_attr_ops.attr, + NULL, +}; +ATTRIBUTE_GROUPS(scx_sched); + +static const struct kobj_type scx_ktype = { + .release = scx_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = scx_sched_groups, +}; + +static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); +} + +static const struct kset_uevent_ops scx_uevent_ops = { + .uevent = scx_uevent, +}; + /* * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. @@ -3264,6 +3345,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) if (scx_ops.exit) SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + kobject_del(scx_root_kobj); + scx_root_kobj = NULL; + memset(&scx_ops, 0, sizeof(scx_ops)); rhashtable_walk_enter(&dsq_hash, &rht_iter); @@ -3390,6 +3474,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err; } + scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); + if (!scx_root_kobj) { + ret = -ENOMEM; + goto err; + } + + scx_root_kobj->kset = scx_kset; + ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err; + /* * Set scx_ops, transition to PREPPING and clear exit info to arm the * disable path. Failure triggers full disabling from here on. @@ -3603,6 +3698,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) if (scx_switch_all_req) static_branch_enable(&__scx_switched_all); + kobject_uevent(scx_root_kobj, KOBJ_ADD); mutex_unlock(&scx_ops_enable_mutex); scx_cgroup_config_knobs(); @@ -3610,6 +3706,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) return 0; err: + kfree(scx_root_kobj); + scx_root_kobj = NULL; mutex_unlock(&scx_ops_enable_mutex); return ret; @@ -3657,6 +3755,7 @@ const struct file_operations sched_ext_fops = { }; #endif + /******************************************************************************** * bpf_struct_ops plumbing. */ @@ -3840,6 +3939,11 @@ struct bpf_struct_ops bpf_sched_ext_ops = { .name = "sched_ext_ops", }; + +/******************************************************************************** + * System integration and init. 
+ */ + static void sysrq_handle_sched_ext_reset(u8 key) { if (scx_ops_helper) @@ -3939,7 +4043,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+lldms", (s64)(runnable_at - jiffies) * (HZ / MSEC_PER_SEC)); - /* Print everything onto one line to conserve console spce. */ + /* print everything onto one line to conserve console space */ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, runnable_at_buf); @@ -4751,8 +4855,22 @@ static int __init scx_init(void) } ret = register_pm_notifier(&scx_pm_notifier); - if (ret) - pr_warn("sched_ext: Failed to register PM notifier (%d)\n", ret); + if (ret) { + pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); + return ret; + } + + scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); + if (!scx_kset) { + pr_err("sched_ext: Failed to create /sys/sched_ext\n"); + return -ENOMEM; + } + + ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); + if (ret < 0) { + pr_err("sched_ext: Failed to add global attributes\n"); + return ret; + } return 0; } From a1392ed1201b17937ee70916cd5537e846ebfbc5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Jan 2024 07:58:54 -1000 Subject: [PATCH 265/304] scx: Replace /sys/kernel/debug/sched/ext with tools/sched_ext/scx_show_state.py Now that the state is visible through /sys/kernel/sched_ext, /sys/kernel/debug/sched/ext isn't needed to determine the current state of scx. However, /sys/kernel/sched_ext shows only a subset of information that was available in the debug interface and it can be useful to have access to the rest for debugging. Remove /sys/kernel/debug/sched/ext and add the drgn script, tools/sched_ext/scx_show_state.py, which shows the same information. Signed-off-by: Tejun Heo --- Documentation/scheduler/sched-ext.rst | 16 +++++++++++ kernel/sched/debug.c | 3 --- kernel/sched/ext.c | 31 --------------------- kernel/sched/ext.h | 1 - tools/sched_ext/scx_show_state.py | 39 +++++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 35 deletions(-) create mode 100644 tools/sched_ext/scx_show_state.py diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index b49f3f94a6cba..e3847b44e17c3 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -60,9 +60,25 @@ The current status of the BPF scheduler can be determined as follows: # cat /sys/kernel/sched_ext/root/ops simple +``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more +detailed information: + +.. code-block:: none + + # tools/sched_ext/scx_show_state.py + ops : simple + enabled : 1 + switching_all : 1 + switched_all : 1 + enable_state : enabled (2) + bypass_depth : 0 + nr_rejected : 0 + If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can be determined as follows: +.. 
code-block:: none + # grep ext /proc/self/sched ext.enabled : 1 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6587a45ffe966..c5ee001d3459b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -374,9 +374,6 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); -#ifdef CONFIG_SCHED_CLASS_EXT - debugfs_create_file("ext", 0444, debugfs_sched, NULL, &sched_ext_fops); -#endif return 0; } late_initcall(sched_init_debug); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 65c1bdf5b9637..b89107c8288dc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3724,37 +3724,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops) return ret; } -#ifdef CONFIG_SCHED_DEBUG -static int scx_debug_show(struct seq_file *m, void *v) -{ - mutex_lock(&scx_ops_enable_mutex); - seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); - seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled()); - seq_printf(m, "%-30s: %d\n", "switching_all", - READ_ONCE(scx_switching_all)); - seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all()); - seq_printf(m, "%-30s: %s\n", "enable_state", - scx_ops_enable_state_str[scx_ops_enable_state()]); - seq_printf(m, "%-30s: %d\n", "bypassing", scx_ops_bypassing()); - seq_printf(m, "%-30s: %lu\n", "nr_rejected", - atomic_long_read(&scx_nr_rejected)); - mutex_unlock(&scx_ops_enable_mutex); - return 0; -} - -static int scx_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, scx_debug_show, NULL); -} - -const struct file_operations sched_ext_fops = { - .open = scx_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - /******************************************************************************** * bpf_struct_ops plumbing. diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 3055efbfaf526..3aa6598ad2312 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -103,7 +103,6 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); extern const struct sched_class ext_sched_class; extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; -extern const struct file_operations sched_ext_fops; extern unsigned long scx_watchdog_timeout; extern unsigned long scx_watchdog_timestamp; diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 index 0000000000000..d457d2a74e1ef --- /dev/null +++ b/tools/sched_ext/scx_show_state.py @@ -0,0 +1,39 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo +# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current sched_ext state. +For more info on drgn, visit https://github.com/osandov/drgn. 
+""" + +import drgn +import sys + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def read_int(name): + return int(prog[name].value_()) + +def read_atomic(name): + return prog[name].counter.value_() + +def read_static_key(name): + return prog[name].key.enabled.counter.value_() + +def ops_state_str(state): + return prog['scx_ops_enable_state_str'][state].string_().decode() + +ops = prog['scx_ops'] +enable_state = read_atomic("scx_ops_enable_state_var") + +print(f'ops : {ops.name.string_().decode()}') +print(f'enabled : {read_static_key("__scx_ops_enabled")}') +print(f'switching_all : {read_int("scx_switching_all")}') +print(f'switched_all : {read_static_key("__scx_switched_all")}') +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') From a141212211d70a278658ce22eefa048b0c750fb7 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 19 Jan 2024 18:10:28 -0600 Subject: [PATCH 266/304] scx: Fix a couple follow ups to recent struct_ops changes - Fix a few typos and some comment formatting in ext.c - Generalize the rule for compiling a "fail" testcase variant in seltests - Update copyrights to 2024 Signed-off-by: David Vernet --- kernel/sched/ext.c | 31 +++++++++++-------- tools/testing/selftests/scx/Makefile | 7 +++-- tools/testing/selftests/scx/maybe_null.bpf.c | 2 +- tools/testing/selftests/scx/maybe_null.c | 4 +-- .../selftests/scx/maybe_null_fail.bpf.c | 2 +- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 19bed179b6b84..52457f65aa572 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3761,26 +3761,31 @@ static bool promote_dispatch_2nd_arg(int off, int size, if (member_idx >= btf_type_vlen(t)) return false; - /* Get the member name of this program. For example, the - * member name of the dispatch program is "dispatch". + /* + * Get the member name of this struct_ops program, which corresponds to + * a field in struct sched_ext_ops. For example, the member name of the + * dispatch struct_ops program (callback) is "dispatch". */ member = &btf_type_member(t)[member_idx]; mname = btf_name_by_offset(btf_vmlinux, member->name_off); - /* Chkeck if it is the 2nd argument of the function pointer at - * "dispatch" in struct sched_ext_ops. The arguments of - * struct_ops operators are placed in the context one after - * another. And, they are 64-bits. So, the 2nd argument is at - * offset sizeof(__u64). + /* + * Check if it is the second argument of the function pointer at + * "dispatch" in struct sched_ext_ops. The arguments of struct_ops + * operators are sequential and 64-bit, so the second argument is at + * offset sizeof(__u64). */ if (strcmp(mname, "dispatch") == 0 && off == sizeof(__u64)) { - /* The value is a pointer to a type (struct - * task_struct) given by a BTF ID (PTR_TO_BTF_ID). It - * is tursted (PTR_TRUSTED), however, can be a NULL - * (PTR_MAYBE_NULL). The BPF program should check the - * pointer to make sure it is not null before using - * it, or the verifier will reject the program. + /* + * The value is a pointer to a type (struct task_struct) given + * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), + * however, can be a NULL (PTR_MAYBE_NULL). The BPF program + * should check the pointer to make sure it is not NULL before + * using it, or the verifier will reject the program. 
+ * + * Longer term, this is something that should be addressed by + * BTF, and be fully contained within the verifier. */ info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile index 8d10a11c3751a..e7ec3397bcc5b 100644 --- a/tools/testing/selftests/scx/Makefile +++ b/tools/testing/selftests/scx/Makefile @@ -176,8 +176,6 @@ testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-target $(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) $(CC) $(CFLAGS) -c $< -o $@ -$(SCXOBJ_DIR)/maybe_null.o: $(INCLUDE_DIR)/maybe_null_fail.bpf.skel.h - # Create all of the test targets object files, whose testcase objects will be # registered into the runner in ELF constructors. # @@ -185,7 +183,10 @@ $(SCXOBJ_DIR)/maybe_null.o: $(INCLUDE_DIR)/maybe_null_fail.bpf.skel.h # compiling BPF object files only if one is present, as the wildcard Make # function doesn't support using implicit rules otherwise. .SECONDEXPANSION: -$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $$(if $$(wildcard $$*.bpf.c), $(INCLUDE_DIR)/%.bpf.skel.h) | $(SCXOBJ_DIR) +$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o \ + $$(if $$(wildcard $$*.bpf.c), $(INCLUDE_DIR)/%.bpf.skel.h) \ + $$(if $$(wildcard $$*_fail.bpf.c), $(INCLUDE_DIR)/%_fail.bpf.skel.h) \ + | $(SCXOBJ_DIR) $(eval test=$(patsubst %.o,%.c,$(notdir $@))) $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o diff --git a/tools/testing/selftests/scx/maybe_null.bpf.c b/tools/testing/selftests/scx/maybe_null.bpf.c index d45a77281c053..1e9b1fdedc88a 100644 --- a/tools/testing/selftests/scx/maybe_null.bpf.c +++ b/tools/testing/selftests/scx/maybe_null.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include diff --git a/tools/testing/selftests/scx/maybe_null.c b/tools/testing/selftests/scx/maybe_null.c index 578820d21685b..4f093a5ee4de8 100644 --- a/tools/testing/selftests/scx/maybe_null.c +++ b/tools/testing/selftests/scx/maybe_null.c @@ -1,8 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. - * Copyright (c) 2023 David Vernet - * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include #include diff --git a/tools/testing/selftests/scx/maybe_null_fail.bpf.c b/tools/testing/selftests/scx/maybe_null_fail.bpf.c index c09a5b7ecf20f..bc99c13ce5839 100644 --- a/tools/testing/selftests/scx/maybe_null_fail.bpf.c +++ b/tools/testing/selftests/scx/maybe_null_fail.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include From f6c30bfe5a49bc38cae985083a11016800708fea Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 6 Nov 2023 20:44:34 -0800 Subject: [PATCH 267/304] f2fs: explicitly null-terminate the xattr list commit e26b6d39270f5eab0087453d9b544189a38c8564 upstream. When setting an xattr, explicitly null-terminate the xattr list. This eliminates the fragile assumption that the unused xattr space is always zeroed. 
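The pattern, as a simplified userspace-style sketch (the names and types below are illustrative, not the real struct f2fs_xattr_entry or the actual on-disk layout): the xattr list is a run of variable-length entries and lookups stop at a zero header word, so the writer must place that terminator itself rather than trust that the bytes past the last entry are already zero.

    #include <stdint.h>
    #include <string.h>

    /* illustrative entry header; a zero header word ends the list */
    struct xentry {
            uint16_t value_size;
            /* name and value bytes follow */
    };

    static void append_entry(uint8_t *last, const void *value, uint16_t size)
    {
            memcpy(last + sizeof(struct xentry), value, size);
            ((struct xentry *)last)->value_size = size;
            /* write the terminator explicitly; never rely on the unused
             * tail of the buffer having been zeroed */
            memset(last + sizeof(struct xentry) + size, 0, sizeof(uint32_t));
    }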
Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/xattr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 47e88b4d4e7d0..a8fc2cac68799 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -754,6 +754,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, memcpy(pval, value, size); last->e_value_size = cpu_to_le16(size); new_hsize += newsize; + /* + * Explicitly add the null terminator. The unused xattr space + * is supposed to always be zeroed, which would make this + * unnecessary, but don't depend on that. + */ + *(u32 *)((u8 *)last + newsize) = 0; } error = write_all_xattrs(inode, new_hsize, base_addr, ipage); From ba4593e47982b71a6a49d6120c938395bb2f1543 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 21 Dec 2023 13:25:18 +0000 Subject: [PATCH 268/304] ALSA: hda/realtek: Add quirks for Dell models commit 423206604b28174698d77bf5ea81365cdd6c0f77 upstream. These models use 2 or 4 CS35L41 amps with HDA using SPI and I2C. Models use internal and external boost. All models require DSD support to be added inside cs35l41_hda_property.c Signed-off-by: Stefan Binding Cc: # v6.7+ Link: https://lore.kernel.org/r/20231221132518.3213-4-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 70b17b08d4ffa..942722a04e80f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6956,6 +6956,11 @@ static void cs35l41_fixup_i2c_two(struct hda_codec *cdc, const struct hda_fixup cs35l41_generic_fixup(cdc, action, "i2c", "CSC3551", 2); } +static void cs35l41_fixup_i2c_four(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + cs35l41_generic_fixup(cdc, action, "i2c", "CSC3551", 4); +} + static void cs35l41_fixup_spi_two(struct hda_codec *codec, const struct hda_fixup *fix, int action) { cs35l41_generic_fixup(codec, action, "spi", "CSC3551", 2); @@ -7441,6 +7446,7 @@ enum { ALC287_FIXUP_LEGION_16ACHG6, ALC287_FIXUP_CS35L41_I2C_2, ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED, + ALC287_FIXUP_CS35L41_I2C_4, ALC245_FIXUP_CS35L41_SPI_2, ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED, ALC245_FIXUP_CS35L41_SPI_4, @@ -9427,6 +9433,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC285_FIXUP_HP_MUTE_LED, }, + [ALC287_FIXUP_CS35L41_I2C_4] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l41_fixup_i2c_four, + }, [ALC245_FIXUP_CS35L41_SPI_2] = { .type = HDA_FIXUP_FUNC, .v.func = cs35l41_fixup_spi_two, @@ -9703,6 +9713,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0a9e, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0b19, "Dell XPS 15 9520", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x0b1a, "Dell Precision 5570", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x0b27, "Dell", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1028, 0x0b28, "Dell", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0b37, "Dell Inspiron 16 Plus 7620 2-in-1", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), SND_PCI_QUIRK(0x1028, 0x0b71, "Dell Inspiron 16 Plus 7620", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), SND_PCI_QUIRK(0x1028, 0x0beb, "Dell XPS 15 9530 (2023)", ALC289_FIXUP_DELL_CS35L41_SPI_2), @@ -9713,6 +9725,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = 
{ SND_PCI_QUIRK(0x1028, 0x0c1c, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), SND_PCI_QUIRK(0x1028, 0x0c1d, "Dell Precision 3440", ALC236_FIXUP_DELL_DUAL_CODECS), SND_PCI_QUIRK(0x1028, 0x0c1e, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), + SND_PCI_QUIRK(0x1028, 0x0c4d, "Dell", ALC287_FIXUP_CS35L41_I2C_4), SND_PCI_QUIRK(0x1028, 0x0cbd, "Dell Oasis 13 CS MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbe, "Dell Oasis 13 2-IN-1 MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbf, "Dell Oasis 13 Low Weight MTU-L", ALC289_FIXUP_DELL_CS35L41_SPI_2), From fdcf3681347c3a42762428c86fa087b787e9b1b2 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 21 Dec 2023 13:25:16 +0000 Subject: [PATCH 269/304] ALSA: hda: cs35l41: Support additional Dell models without _DSD commit ee694e7db47e1af00ffb29f569904a9ed576868f upstream. Add new model entries into configuration table. Signed-off-by: Stefan Binding Cc: # v6.7+ Link: https://lore.kernel.org/r/20231221132518.3213-2-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/cs35l41_hda_property.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index c1afb721b4c67..cbc9db5a3a8df 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -35,6 +35,10 @@ struct cs35l41_config { }; static const struct cs35l41_config cs35l41_config_table[] = { + { "10280B27", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10280B28", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10280BEB", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, + { "10280C4D", I2C, 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 1, -1, 1000, 4500, 24 }, /* * Device 103C89C6 does have _DSD, however it is setup to use the wrong boost type. * We can override the _DSD to correct the boost type here. @@ -345,6 +349,10 @@ struct cs35l41_prop_model { static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CLSA0100", NULL, lenovo_legion_no_acpi }, { "CLSA0101", NULL, lenovo_legion_no_acpi }, + { "CSC3551", "10280B27", generic_dsd_config }, + { "CSC3551", "10280B28", generic_dsd_config }, + { "CSC3551", "10280BEB", generic_dsd_config }, + { "CSC3551", "10280C4D", generic_dsd_config }, { "CSC3551", "103C89C6", generic_dsd_config }, { "CSC3551", "104312AF", generic_dsd_config }, { "CSC3551", "10431433", generic_dsd_config }, From 40a3bf7824062928596816dff1fd9a457974335b Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 21 Dec 2023 13:25:17 +0000 Subject: [PATCH 270/304] ALSA: hda: cs35l41: Prevent firmware load if SPI speed too low commit d110858a6925827609d11db8513d76750483ec06 upstream. Some laptops without _DSD have the SPI speed set very low in the BIOS. Since the SPI controller uses this speed as its max speed, the SPI transactions are very slow. Firmware download writes to many registers, and if the SPI speed is too slow, it can take a long time to download. For this reason, disable firmware loading if the maximum SPI speed is too low. Without Firmware, audio playback will work, but the volume will be low to ensure safe operation of the CS35L41. 
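The check itself is small; condensed from the cs35l41_hda_read_acpi() hunk below (CS35L41_MAX_ACCEPTABLE_SPI_SPEED_HZ and the bypass_fw flag are both introduced by this patch):

    cs35l41->bypass_fw = false;
    if (cs35l41->control_bus == SPI) {
            struct spi_device *spi = to_spi_device(cs35l41->dev);

            if (spi->max_speed_hz < CS35L41_MAX_ACCEPTABLE_SPI_SPEED_HZ) {
                    dev_warn(cs35l41->dev,
                             "SPI speed is too slow to support firmware download: %d Hz.\n",
                             spi->max_speed_hz);
                    /* keep the amp usable, just without DSP firmware */
                    cs35l41->bypass_fw = true;
            }
    }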
Signed-off-by: Stefan Binding Cc: # v6.7+ Link: https://lore.kernel.org/r/20231221132518.3213-3-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/cs35l41_hda.c | 25 ++++++++-- sound/pci/hda/cs35l41_hda.h | 12 ++++- sound/pci/hda/cs35l41_hda_i2c.c | 2 +- sound/pci/hda/cs35l41_hda_property.c | 74 +++++++++++++--------------- sound/pci/hda/cs35l41_hda_spi.c | 2 +- 5 files changed, 70 insertions(+), 45 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 92ca2b3b6c924..d3fa6e136744d 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" @@ -996,6 +997,11 @@ static int cs35l41_smart_amp(struct cs35l41_hda *cs35l41) __be32 halo_sts; int ret; + if (cs35l41->bypass_fw) { + dev_warn(cs35l41->dev, "Bypassing Firmware.\n"); + return 0; + } + ret = cs35l41_init_dsp(cs35l41); if (ret) { dev_warn(cs35l41->dev, "Cannot Initialize Firmware. Error: %d\n", ret); @@ -1588,6 +1594,7 @@ static int cs35l41_hda_read_acpi(struct cs35l41_hda *cs35l41, const char *hid, i u32 values[HDA_MAX_COMPONENTS]; struct acpi_device *adev; struct device *physdev; + struct spi_device *spi; const char *sub; char *property; size_t nval; @@ -1610,7 +1617,7 @@ static int cs35l41_hda_read_acpi(struct cs35l41_hda *cs35l41, const char *hid, i ret = cs35l41_add_dsd_properties(cs35l41, physdev, id, hid); if (!ret) { dev_info(cs35l41->dev, "Using extra _DSD properties, bypassing _DSD in ACPI\n"); - goto put_physdev; + goto out; } property = "cirrus,dev-index"; @@ -1701,8 +1708,20 @@ static int cs35l41_hda_read_acpi(struct cs35l41_hda *cs35l41, const char *hid, i hw_cfg->bst_type = CS35L41_EXT_BOOST; hw_cfg->valid = true; +out: put_device(physdev); + cs35l41->bypass_fw = false; + if (cs35l41->control_bus == SPI) { + spi = to_spi_device(cs35l41->dev); + if (spi->max_speed_hz < CS35L41_MAX_ACCEPTABLE_SPI_SPEED_HZ) { + dev_warn(cs35l41->dev, + "SPI speed is too slow to support firmware download: %d Hz.\n", + spi->max_speed_hz); + cs35l41->bypass_fw = true; + } + } + return 0; err: @@ -1711,14 +1730,13 @@ static int cs35l41_hda_read_acpi(struct cs35l41_hda *cs35l41, const char *hid, i hw_cfg->gpio1.valid = false; hw_cfg->gpio2.valid = false; acpi_dev_put(cs35l41->dacpi); -put_physdev: put_device(physdev); return ret; } int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int irq, - struct regmap *regmap) + struct regmap *regmap, enum control_bus control_bus) { unsigned int regid, reg_revid; struct cs35l41_hda *cs35l41; @@ -1737,6 +1755,7 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i cs35l41->dev = dev; cs35l41->irq = irq; cs35l41->regmap = regmap; + cs35l41->control_bus = control_bus; dev_set_drvdata(dev, cs35l41); ret = cs35l41_hda_read_acpi(cs35l41, device_name, id); diff --git a/sound/pci/hda/cs35l41_hda.h b/sound/pci/hda/cs35l41_hda.h index 3d925d677213d..43d55292b327a 100644 --- a/sound/pci/hda/cs35l41_hda.h +++ b/sound/pci/hda/cs35l41_hda.h @@ -20,6 +20,8 @@ #include #include +#define CS35L41_MAX_ACCEPTABLE_SPI_SPEED_HZ 1000000 + struct cs35l41_amp_cal_data { u32 calTarget[2]; u32 calTime[2]; @@ -46,6 +48,11 @@ enum cs35l41_hda_gpio_function { CS35l41_SYNC, }; +enum control_bus { + I2C, + SPI +}; + struct cs35l41_hda { struct device *dev; struct regmap *regmap; @@ -74,6 +81,9 @@ struct cs35l41_hda { struct cs_dsp cs_dsp; struct 
acpi_device *dacpi; bool mute_override; + enum control_bus control_bus; + bool bypass_fw; + }; enum halo_state { @@ -85,7 +95,7 @@ enum halo_state { extern const struct dev_pm_ops cs35l41_hda_pm_ops; int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int irq, - struct regmap *regmap); + struct regmap *regmap, enum control_bus control_bus); void cs35l41_hda_remove(struct device *dev); int cs35l41_get_speaker_id(struct device *dev, int amp_index, int num_amps, int fixed_gpio_id); diff --git a/sound/pci/hda/cs35l41_hda_i2c.c b/sound/pci/hda/cs35l41_hda_i2c.c index b44536fbba17d..603e9bff3a71d 100644 --- a/sound/pci/hda/cs35l41_hda_i2c.c +++ b/sound/pci/hda/cs35l41_hda_i2c.c @@ -30,7 +30,7 @@ static int cs35l41_hda_i2c_probe(struct i2c_client *clt) return -ENODEV; return cs35l41_hda_probe(&clt->dev, device_name, clt->addr, clt->irq, - devm_regmap_init_i2c(clt, &cs35l41_regmap_i2c)); + devm_regmap_init_i2c(clt, &cs35l41_regmap_i2c), I2C); } static void cs35l41_hda_i2c_remove(struct i2c_client *clt) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index cbc9db5a3a8df..fb15fa21abf44 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -16,10 +16,6 @@ struct cs35l41_config { const char *ssid; - enum { - SPI, - I2C - } bus; int num_amps; enum { INTERNAL, @@ -35,46 +31,46 @@ struct cs35l41_config { }; static const struct cs35l41_config cs35l41_config_table[] = { - { "10280B27", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10280B28", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10280BEB", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, - { "10280C4D", I2C, 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 1, -1, 1000, 4500, 24 }, + { "10280B27", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10280B28", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10280BEB", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, + { "10280C4D", 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 1, -1, 1000, 4500, 24 }, /* * Device 103C89C6 does have _DSD, however it is setup to use the wrong boost type. * We can override the _DSD to correct the boost type here. * Since this laptop has valid ACPI, we do not need to handle cs-gpios, since that already exists * in the ACPI. The Reset GPIO is also valid, so we can use the Reset defined in _DSD. 
*/ - { "103C89C6", SPI, 2, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, -1, -1, -1, 1000, 4500, 24 }, - { "104312AF", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10431433", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431463", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431473", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 1000, 4500, 24 }, - { "10431483", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 1000, 4500, 24 }, - { "10431493", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "104314D3", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "104314E3", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431503", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431533", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431573", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10431663", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 1000, 4500, 24 }, - { "104316D3", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, - { "104316F3", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, - { "104317F3", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431863", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "104318D3", I2C, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, - { "10431C9F", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10431CAF", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10431CCF", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10431CDF", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10431CEF", SPI, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, - { "10431D1F", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431DA2", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, - { "10431E02", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, - { "10431EE2", I2C, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, - { "10431F12", I2C, 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, - { "10431F1F", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, - { "10431F62", SPI, 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "103C89C6", 2, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, -1, -1, -1, 1000, 4500, 24 }, + { "104312AF", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10431433", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431463", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431473", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 1000, 4500, 24 }, + { "10431483", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 1000, 4500, 24 }, + { 
"10431493", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "104314D3", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "104314E3", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431503", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431533", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431573", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10431663", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 1000, 4500, 24 }, + { "104316D3", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "104316F3", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "104317F3", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431863", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "104318D3", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "10431C9F", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10431CAF", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10431CCF", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10431CDF", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10431CEF", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, + { "10431D1F", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431DA2", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "10431E02", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "10431EE2", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, + { "10431F12", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, + { "10431F1F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, + { "10431F62", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, {} }; @@ -212,7 +208,7 @@ static int generic_dsd_config(struct cs35l41_hda *cs35l41, struct device *physde "_DSD already exists.\n"); } - if (cfg->bus == SPI) { + if (cs35l41->control_bus == SPI) { cs35l41->index = id; /* diff --git a/sound/pci/hda/cs35l41_hda_spi.c b/sound/pci/hda/cs35l41_hda_spi.c index eb287aa5f7825..b76c0dfd5fefc 100644 --- a/sound/pci/hda/cs35l41_hda_spi.c +++ b/sound/pci/hda/cs35l41_hda_spi.c @@ -26,7 +26,7 @@ static int cs35l41_hda_spi_probe(struct spi_device *spi) return -ENODEV; return cs35l41_hda_probe(&spi->dev, device_name, spi_get_chipselect(spi, 0), spi->irq, - devm_regmap_init_spi(spi, &cs35l41_regmap_spi)); + devm_regmap_init_spi(spi, &cs35l41_regmap_spi), SPI); } static void cs35l41_hda_spi_remove(struct spi_device *spi) From 576145977b774f8a313702cf070d0ae690b7eed7 Mon Sep 17 00:00:00 2001 From: Dorian Cruveiller Date: Sat, 30 Dec 2023 12:43:12 +0100 Subject: [PATCH 271/304] ALSA: hda: Add driver properties for cs35l41 for Lenovo Legion Slim 7 Gen 8 serie commit ba7053b4b4a4ddcf530fa2b897e697004715d086 upstream. 
Add driver properties on 4 models of this laptop serie since they don't have _DSD in the ACPI table Signed-off-by: Dorian Cruveiller Cc: # v6.7 Link: https://lore.kernel.org/r/20231230114312.22118-1-doriancruveiller@gmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/cs35l41_hda_property.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index fb15fa21abf44..a51fb6b0f56d6 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -71,6 +71,10 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "10431F12", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, { "10431F1F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, { "10431F62", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "17AA38B4", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA38B5", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA38B6", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA38B7", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, {} }; @@ -379,6 +383,10 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "10431F12", generic_dsd_config }, { "CSC3551", "10431F1F", generic_dsd_config }, { "CSC3551", "10431F62", generic_dsd_config }, + { "CSC3551", "17AA38B4", generic_dsd_config }, + { "CSC3551", "17AA38B5", generic_dsd_config }, + { "CSC3551", "17AA38B6", generic_dsd_config }, + { "CSC3551", "17AA38B7", generic_dsd_config }, {} }; From 6f33a312264f77c4371e1eeaea65da5030d89698 Mon Sep 17 00:00:00 2001 From: Dorian Cruveiller Date: Sat, 30 Dec 2023 12:40:01 +0100 Subject: [PATCH 272/304] ALSA: hda/realtek: enable SND_PCI_QUIRK for Lenovo Legion Slim 7 Gen 8 (2023) serie commit 99af5b11c57d33c32d761797f6308b40936c22ed upstream. Link up the realtek audio chip to the cirrus cs35l41 sound amplifier chip on 4 models of the Lenovo legion slim 7 gen 8 (2023). These models are 16IRH8 (2 differents subsystem id) and 16APH8 (2 differents subsystem ids). 
Subsystem ids list: - 17AA38B4 - 17AA38B5 - 17AA38B6 - 17AA38B7 Signed-off-by: Dorian Cruveiller Cc: # v6.7 Link: https://lore.kernel.org/r/20231230114001.19855-1-doriancruveiller@gmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 942722a04e80f..296a8f695e30f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10242,6 +10242,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3886, "Y780 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a7, "Y780P AMD YG dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a8, "Y780P AMD VECO dual", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x38b4, "Legion Slim 7 16IRH8", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x38b5, "Legion Slim 7 16IRH8", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x38b6, "Legion Slim 7 16APH8", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x38b7, "Legion Slim 7 16APH8", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x38ba, "Yoga S780-14.5 Air AMD quad YC", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38bb, "Yoga S780-14.5 Air AMD quad AAC", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38be, "Yoga S980-14.5 proX YC Dual", ALC287_FIXUP_TAS2781_I2C), From 8a975bfe02e8bdab70c1900cb1725c27f1ecbbd8 Mon Sep 17 00:00:00 2001 From: Tom Jason Schwanke Date: Mon, 8 Jan 2024 16:15:21 +0100 Subject: [PATCH 273/304] ALSA: hda/realtek: Fix mute and mic-mute LEDs for HP Envy X360 13-ay0xxx commit 6b3d14b7f9b1acaf7303d8499836bf78ee9c470c upstream. This enables the mute and mic-mute LEDs on the HP Envy X360 13-ay0xxx convertibles. The quirk 'ALC245_FIXUP_HP_X360_MUTE_LEDS' already exists and is now enabled for this device. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216197 Signed-off-by: Tom Jason Schwanke Cc: Link: https://lore.kernel.org/r/651b26e9-e86b-45dd-aa90-3e43d6b99823@catboys.cloud Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 296a8f695e30f..04a3dffcb4127 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9829,6 +9829,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8735, "HP ProBook 435 G7", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8736, "HP", ALC285_FIXUP_HP_GPIO_AMP_INIT), SND_PCI_QUIRK(0x103c, 0x8760, "HP", ALC285_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x876e, "HP ENVY x360 Convertible 13-ay0xxx", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x877a, "HP", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x877d, "HP", ALC236_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x8780, "HP ZBook Fury 17 G7 Mobile Workstation", From 2b12fe9daa3e2ae2c637ad6f25de7436bbfa6571 Mon Sep 17 00:00:00 2001 From: Lorenz Brun Date: Tue, 2 Jan 2024 22:48:20 +0100 Subject: [PATCH 274/304] ALSA: hda: cs35l41: Support more HP models without _DSD commit 7d65d70161ef75a3991480c91668ac11acedf211 upstream. This adds overrides for a series of notebooks using a common config taken from HP's proprietary Windows driver. This has been tested on a HP 15-ey0xxxx device (subsystem 103C8A31) together with another Realtek quirk and the calibration files from the proprietary driver. 
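For context, overrides like these are selected by matching the ACPI HID ("CSC3551") together with the machine's subsystem ID. A rough, hypothetical sketch of that lookup (the struct and function names below are illustrative stand-ins only; the real table in this file is cs35l41_prop_model_table and the real callback is generic_dsd_config):

    #include <string.h>

    struct prop_model {
            const char *hid;        /* e.g. "CSC3551" */
            const char *ssid;       /* e.g. "17AA38B4"; NULL matches any machine */
            int (*config)(void *cs35l41, const char *hid, const char *ssid);
    };

    /* walk the table and run the first matching override */
    static int apply_model_override(const struct prop_model *table, void *cs35l41,
                                    const char *hid, const char *ssid)
    {
            const struct prop_model *m;

            for (m = table; m->hid; m++)
                    if (!strcmp(m->hid, hid) && (!m->ssid || !strcmp(m->ssid, ssid)))
                            return m->config(cs35l41, hid, ssid);

            return -1;      /* no override; fall back to plain _DSD parsing */
    }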
Signed-off-by: Lorenz Brun Cc: # v6.7 Link: https://lore.kernel.org/r/20240102214821.3394810-1-lorenz@brun.one Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/cs35l41_hda_property.c | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index a51fb6b0f56d6..35277ce890a46 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -42,6 +42,28 @@ static const struct cs35l41_config cs35l41_config_table[] = { * in the ACPI. The Reset GPIO is also valid, so we can use the Reset defined in _DSD. */ { "103C89C6", 2, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, -1, -1, -1, 1000, 4500, 24 }, + { "103C8A28", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A29", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A2A", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A2B", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A2C", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A2D", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A2E", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A30", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8A31", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BB3", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BB4", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BDF", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BE0", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BE1", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BE2", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BE9", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BDD", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BDE", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BE3", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BE5", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8BE6", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, + { "103C8B3A", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4100, 24 }, { "104312AF", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, { "10431433", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, { "10431463", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, @@ -354,6 +376,28 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "10280BEB", generic_dsd_config }, { "CSC3551", "10280C4D", generic_dsd_config }, { "CSC3551", "103C89C6", generic_dsd_config }, + { "CSC3551", "103C8A28", generic_dsd_config }, + { "CSC3551", "103C8A29", generic_dsd_config }, + { "CSC3551", "103C8A2A", generic_dsd_config }, + { "CSC3551", 
"103C8A2B", generic_dsd_config }, + { "CSC3551", "103C8A2C", generic_dsd_config }, + { "CSC3551", "103C8A2D", generic_dsd_config }, + { "CSC3551", "103C8A2E", generic_dsd_config }, + { "CSC3551", "103C8A30", generic_dsd_config }, + { "CSC3551", "103C8A31", generic_dsd_config }, + { "CSC3551", "103C8BB3", generic_dsd_config }, + { "CSC3551", "103C8BB4", generic_dsd_config }, + { "CSC3551", "103C8BDF", generic_dsd_config }, + { "CSC3551", "103C8BE0", generic_dsd_config }, + { "CSC3551", "103C8BE1", generic_dsd_config }, + { "CSC3551", "103C8BE2", generic_dsd_config }, + { "CSC3551", "103C8BE9", generic_dsd_config }, + { "CSC3551", "103C8BDD", generic_dsd_config }, + { "CSC3551", "103C8BDE", generic_dsd_config }, + { "CSC3551", "103C8BE3", generic_dsd_config }, + { "CSC3551", "103C8BE5", generic_dsd_config }, + { "CSC3551", "103C8BE6", generic_dsd_config }, + { "CSC3551", "103C8B3A", generic_dsd_config }, { "CSC3551", "104312AF", generic_dsd_config }, { "CSC3551", "10431433", generic_dsd_config }, { "CSC3551", "10431463", generic_dsd_config }, From e848c4bb56d9b0bf0d222bf311f5eea72d06b1e7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 23 Dec 2023 15:57:06 +0100 Subject: [PATCH 275/304] ACPI: resource: Add another DMI match for the TongFang GMxXGxx commit df0cced74159c79e36ce7971f0bf250673296d93 upstream. The TongFang GMxXGxx, which needs IRQ overriding for the keyboard to work, is also sold as the Eluktronics RP-15 which does not use the standard TongFang GMxXGxx DMI board_name. Add an entry for this laptop to the irq1_edge_low_force_override[] DMI table to make the internal keyboard functional. Reported-by: Luis Acuna Signed-off-by: Hans de Goede Cc: All applicable Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 9bd9f79cd4099..c3536c236be99 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -510,6 +510,13 @@ static const struct dmi_system_id irq1_edge_low_force_override[] = { DMI_MATCH(DMI_BOARD_NAME, "GMxXGxx"), }, }, + { + /* TongFang GMxXGxx sold as Eluktronics Inc. RP-15 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Eluktronics Inc."), + DMI_MATCH(DMI_BOARD_NAME, "RP-15"), + }, + }, { /* TongFang GM6XGxX/TUXEDO Stellaris 16 Gen5 AMD */ .matches = { From ba0bc8bf90c9d84807024cfe6b008138ea970981 Mon Sep 17 00:00:00 2001 From: Sjoerd Simons Date: Tue, 28 Nov 2023 22:35:04 +0100 Subject: [PATCH 276/304] bus: moxtet: Mark the irq as shared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e7830f5a83e96d8cb8efc0412902a03008f8fbe3 upstream. The Turris Mox shares the moxtet IRQ with various devices on the board, so mark the IRQ as shared in the driver as well. Without this loading the module will fail with: genirq: Flags mismatch irq 40. 00002002 (moxtet) vs. 
00002080 (mcp7940x) Signed-off-by: Sjoerd Simons Cc: # v6.2+ Reviewed-by: Marek Behún Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- drivers/bus/moxtet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c index 5eb0fe73ddc45..48c18f95660ab 100644 --- a/drivers/bus/moxtet.c +++ b/drivers/bus/moxtet.c @@ -755,7 +755,7 @@ static int moxtet_irq_setup(struct moxtet *moxtet) moxtet->irq.masked = ~0; ret = request_threaded_irq(moxtet->dev_irq, NULL, moxtet_irq_thread_fn, - IRQF_ONESHOT, "moxtet", moxtet); + IRQF_SHARED | IRQF_ONESHOT, "moxtet", moxtet); if (ret < 0) goto err_free; From dc272a4d63025dba31fbdec2ec2043c591942d66 Mon Sep 17 00:00:00 2001 From: Sjoerd Simons Date: Tue, 28 Nov 2023 22:35:05 +0100 Subject: [PATCH 277/304] bus: moxtet: Add spi device table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit aaafe88d5500ba18b33be72458439367ef878788 upstream. The moxtet module fails to auto-load on. Add a SPI id table to allow it to do so. Signed-off-by: Sjoerd Simons Cc: Reviewed-by: Marek Behún Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- drivers/bus/moxtet.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c index 48c18f95660ab..e384fbc6c1d93 100644 --- a/drivers/bus/moxtet.c +++ b/drivers/bus/moxtet.c @@ -830,6 +830,12 @@ static void moxtet_remove(struct spi_device *spi) mutex_destroy(&moxtet->lock); } +static const struct spi_device_id moxtet_spi_ids[] = { + { "moxtet" }, + { }, +}; +MODULE_DEVICE_TABLE(spi, moxtet_spi_ids); + static const struct of_device_id moxtet_dt_ids[] = { { .compatible = "cznic,moxtet" }, {}, @@ -841,6 +847,7 @@ static struct spi_driver moxtet_spi_driver = { .name = "moxtet", .of_match_table = moxtet_dt_ids, }, + .id_table = moxtet_spi_ids, .probe = moxtet_probe, .remove = moxtet_remove, }; From f90fb3a482d1d4705603ab6c320de0ccd611055c Mon Sep 17 00:00:00 2001 From: Lewis Huang Date: Fri, 1 Dec 2023 06:25:02 -0700 Subject: [PATCH 278/304] drm/amd/display: Pass pwrseq inst for backlight and ABM commit b17ef04bf3a4346d66404454d6a646343ddc9749 upstream. [Why] OTG inst and pwrseq inst mapping is not align therefore we cannot use otg_inst as pwrseq inst to get DCIO register. [How] 1. Pass the correct pwrseq instance to dmub when set abm pipe. 2. LVTMA control index change from panel_inst to pwrseq_inst. 
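The key new piece is deriving the power-sequencer instance from the DIG engine that drives the eDP/LVDS link rather than reusing the panel/OTG index; condensed from translate_dig_inst_to_pwrseq_inst() added to link_factory.c further down in this patch:

    static uint8_t translate_dig_inst_to_pwrseq_inst(struct dc_link *link)
    {
            switch (link->eng_id) {
            case ENGINE_ID_DIGA:
                    return 0;       /* pwrseq instance 0 */
            case ENGINE_ID_DIGB:
                    return 1;       /* pwrseq instance 1 */
            default:
                    ASSERT(false);  /* no power sequencer for this engine */
                    return 0xF;
            }
    }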
Tested-by: Daniel Wheeler Reviewed-by: Phil Hsieh Acked-by: Rodrigo Siqueira Signed-off-by: Lewis Huang Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- .../drm/amd/display/dc/bios/bios_parser2.c | 4 +- .../drm/amd/display/dc/bios/command_table2.c | 12 ++-- .../drm/amd/display/dc/bios/command_table2.h | 2 +- .../gpu/drm/amd/display/dc/dc_bios_types.h | 2 +- drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c | 8 ++- .../gpu/drm/amd/display/dc/dce/dmub_abm_lcd.c | 7 ++- .../gpu/drm/amd/display/dc/dce/dmub_abm_lcd.h | 2 +- .../amd/display/dc/dcn31/dcn31_panel_cntl.c | 5 +- .../amd/display/dc/hwss/dce110/dce110_hwseq.c | 16 ++--- .../amd/display/dc/hwss/dcn21/dcn21_hwseq.c | 36 ++++++++--- drivers/gpu/drm/amd/display/dc/inc/hw/abm.h | 3 +- .../drm/amd/display/dc/inc/hw/panel_cntl.h | 2 + .../drm/amd/display/dc/link/link_factory.c | 59 +++++++++++++------ .../gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 14 ++++- 14 files changed, 119 insertions(+), 53 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c index 2d1f5efa9091a..b5b29451d2db8 100644 --- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c +++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c @@ -1698,7 +1698,7 @@ static enum bp_result bios_parser_enable_disp_power_gating( static enum bp_result bios_parser_enable_lvtma_control( struct dc_bios *dcb, uint8_t uc_pwr_on, - uint8_t panel_instance, + uint8_t pwrseq_instance, uint8_t bypass_panel_control_wait) { struct bios_parser *bp = BP_FROM_DCB(dcb); @@ -1706,7 +1706,7 @@ static enum bp_result bios_parser_enable_lvtma_control( if (!bp->cmd_tbl.enable_lvtma_control) return BP_RESULT_FAILURE; - return bp->cmd_tbl.enable_lvtma_control(bp, uc_pwr_on, panel_instance, bypass_panel_control_wait); + return bp->cmd_tbl.enable_lvtma_control(bp, uc_pwr_on, pwrseq_instance, bypass_panel_control_wait); } static bool bios_parser_is_accelerated_mode( diff --git a/drivers/gpu/drm/amd/display/dc/bios/command_table2.c b/drivers/gpu/drm/amd/display/dc/bios/command_table2.c index 90a02d7bd3da3..ab0adabf9dd4c 100644 --- a/drivers/gpu/drm/amd/display/dc/bios/command_table2.c +++ b/drivers/gpu/drm/amd/display/dc/bios/command_table2.c @@ -976,7 +976,7 @@ static unsigned int get_smu_clock_info_v3_1(struct bios_parser *bp, uint8_t id) static enum bp_result enable_lvtma_control( struct bios_parser *bp, uint8_t uc_pwr_on, - uint8_t panel_instance, + uint8_t pwrseq_instance, uint8_t bypass_panel_control_wait); static void init_enable_lvtma_control(struct bios_parser *bp) @@ -989,7 +989,7 @@ static void init_enable_lvtma_control(struct bios_parser *bp) static void enable_lvtma_control_dmcub( struct dc_dmub_srv *dmcub, uint8_t uc_pwr_on, - uint8_t panel_instance, + uint8_t pwrseq_instance, uint8_t bypass_panel_control_wait) { @@ -1002,8 +1002,8 @@ static void enable_lvtma_control_dmcub( DMUB_CMD__VBIOS_LVTMA_CONTROL; cmd.lvtma_control.data.uc_pwr_action = uc_pwr_on; - cmd.lvtma_control.data.panel_inst = - panel_instance; + cmd.lvtma_control.data.pwrseq_inst = + pwrseq_instance; cmd.lvtma_control.data.bypass_panel_control_wait = bypass_panel_control_wait; dm_execute_dmub_cmd(dmcub->ctx, &cmd, DM_DMUB_WAIT_TYPE_WAIT); @@ -1012,7 +1012,7 @@ static void enable_lvtma_control_dmcub( static enum bp_result enable_lvtma_control( struct bios_parser *bp, uint8_t uc_pwr_on, - uint8_t panel_instance, + uint8_t pwrseq_instance, uint8_t bypass_panel_control_wait) { enum bp_result result = BP_RESULT_FAILURE; @@ -1021,7 +1021,7 @@ static enum 
bp_result enable_lvtma_control( bp->base.ctx->dc->debug.dmub_command_table) { enable_lvtma_control_dmcub(bp->base.ctx->dmub_srv, uc_pwr_on, - panel_instance, + pwrseq_instance, bypass_panel_control_wait); return BP_RESULT_OK; } diff --git a/drivers/gpu/drm/amd/display/dc/bios/command_table2.h b/drivers/gpu/drm/amd/display/dc/bios/command_table2.h index b6d09bf6cf72b..41c8c014397f2 100644 --- a/drivers/gpu/drm/amd/display/dc/bios/command_table2.h +++ b/drivers/gpu/drm/amd/display/dc/bios/command_table2.h @@ -96,7 +96,7 @@ struct cmd_tbl { struct bios_parser *bp, uint8_t id); enum bp_result (*enable_lvtma_control)(struct bios_parser *bp, uint8_t uc_pwr_on, - uint8_t panel_instance, + uint8_t pwrseq_instance, uint8_t bypass_panel_control_wait); }; diff --git a/drivers/gpu/drm/amd/display/dc/dc_bios_types.h b/drivers/gpu/drm/amd/display/dc/dc_bios_types.h index be9aa1a71847d..26940d94d8fb4 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_bios_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_bios_types.h @@ -140,7 +140,7 @@ struct dc_vbios_funcs { enum bp_result (*enable_lvtma_control)( struct dc_bios *bios, uint8_t uc_pwr_on, - uint8_t panel_instance, + uint8_t pwrseq_instance, uint8_t bypass_panel_control_wait); enum bp_result (*get_soc_bb_info)( diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c index d3e6544022b78..930fd929e93a4 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c @@ -145,7 +145,11 @@ static bool dmub_abm_save_restore_ex( return ret; } -static bool dmub_abm_set_pipe_ex(struct abm *abm, uint32_t otg_inst, uint32_t option, uint32_t panel_inst) +static bool dmub_abm_set_pipe_ex(struct abm *abm, + uint32_t otg_inst, + uint32_t option, + uint32_t panel_inst, + uint32_t pwrseq_inst) { bool ret = false; unsigned int feature_support; @@ -153,7 +157,7 @@ static bool dmub_abm_set_pipe_ex(struct abm *abm, uint32_t otg_inst, uint32_t op feature_support = abm_feature_support(abm, panel_inst); if (feature_support == ABM_LCD_SUPPORT) - ret = dmub_abm_set_pipe(abm, otg_inst, option, panel_inst); + ret = dmub_abm_set_pipe(abm, otg_inst, option, panel_inst, pwrseq_inst); return ret; } diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.c index 592a8f7a1c6d0..42c802afc4681 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.c @@ -254,7 +254,11 @@ bool dmub_abm_save_restore( return true; } -bool dmub_abm_set_pipe(struct abm *abm, uint32_t otg_inst, uint32_t option, uint32_t panel_inst) +bool dmub_abm_set_pipe(struct abm *abm, + uint32_t otg_inst, + uint32_t option, + uint32_t panel_inst, + uint32_t pwrseq_inst) { union dmub_rb_cmd cmd; struct dc_context *dc = abm->ctx; @@ -264,6 +268,7 @@ bool dmub_abm_set_pipe(struct abm *abm, uint32_t otg_inst, uint32_t option, uint cmd.abm_set_pipe.header.type = DMUB_CMD__ABM; cmd.abm_set_pipe.header.sub_type = DMUB_CMD__ABM_SET_PIPE; cmd.abm_set_pipe.abm_set_pipe_data.otg_inst = otg_inst; + cmd.abm_set_pipe.abm_set_pipe_data.pwrseq_inst = pwrseq_inst; cmd.abm_set_pipe.abm_set_pipe_data.set_pipe_option = option; cmd.abm_set_pipe.abm_set_pipe_data.panel_inst = panel_inst; cmd.abm_set_pipe.abm_set_pipe_data.ramping_boundary = ramping_boundary; diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.h b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.h index 853564d7f4714..07ea6c8d414f3 100644 --- 
a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.h +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm_lcd.h @@ -44,7 +44,7 @@ bool dmub_abm_save_restore( struct dc_context *dc, unsigned int panel_inst, struct abm_save_restore *pData); -bool dmub_abm_set_pipe(struct abm *abm, uint32_t otg_inst, uint32_t option, uint32_t panel_inst); +bool dmub_abm_set_pipe(struct abm *abm, uint32_t otg_inst, uint32_t option, uint32_t panel_inst, uint32_t pwrseq_inst); bool dmub_abm_set_backlight_level(struct abm *abm, unsigned int backlight_pwm_u16_16, unsigned int frame_ramp, diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c index 217acd4e292a3..d849b1eaa4a5c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c @@ -50,7 +50,7 @@ static bool dcn31_query_backlight_info(struct panel_cntl *panel_cntl, union dmub cmd->panel_cntl.header.type = DMUB_CMD__PANEL_CNTL; cmd->panel_cntl.header.sub_type = DMUB_CMD__PANEL_CNTL_QUERY_BACKLIGHT_INFO; cmd->panel_cntl.header.payload_bytes = sizeof(cmd->panel_cntl.data); - cmd->panel_cntl.data.inst = dcn31_panel_cntl->base.inst; + cmd->panel_cntl.data.pwrseq_inst = dcn31_panel_cntl->base.pwrseq_inst; return dm_execute_dmub_cmd(dc_dmub_srv->ctx, cmd, DM_DMUB_WAIT_TYPE_WAIT_WITH_REPLY); } @@ -78,7 +78,7 @@ static uint32_t dcn31_panel_cntl_hw_init(struct panel_cntl *panel_cntl) cmd.panel_cntl.header.type = DMUB_CMD__PANEL_CNTL; cmd.panel_cntl.header.sub_type = DMUB_CMD__PANEL_CNTL_HW_INIT; cmd.panel_cntl.header.payload_bytes = sizeof(cmd.panel_cntl.data); - cmd.panel_cntl.data.inst = dcn31_panel_cntl->base.inst; + cmd.panel_cntl.data.pwrseq_inst = dcn31_panel_cntl->base.pwrseq_inst; cmd.panel_cntl.data.bl_pwm_cntl = panel_cntl->stored_backlight_registers.BL_PWM_CNTL; cmd.panel_cntl.data.bl_pwm_period_cntl = panel_cntl->stored_backlight_registers.BL_PWM_PERIOD_CNTL; cmd.panel_cntl.data.bl_pwm_ref_div1 = @@ -157,4 +157,5 @@ void dcn31_panel_cntl_construct( dcn31_panel_cntl->base.funcs = &dcn31_link_panel_cntl_funcs; dcn31_panel_cntl->base.ctx = init_data->ctx; dcn31_panel_cntl->base.inst = init_data->inst; + dcn31_panel_cntl->base.pwrseq_inst = init_data->pwrseq_inst; } diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c index 960a55e06375b..9b8299d97e400 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c @@ -790,7 +790,7 @@ void dce110_edp_power_control( struct dc_context *ctx = link->ctx; struct bp_transmitter_control cntl = { 0 }; enum bp_result bp_result; - uint8_t panel_instance; + uint8_t pwrseq_instance; if (dal_graphics_object_id_get_connector_id(link->link_enc->connector) @@ -873,7 +873,7 @@ void dce110_edp_power_control( cntl.coherent = false; cntl.lanes_number = LANE_COUNT_FOUR; cntl.hpd_sel = link->link_enc->hpd_source; - panel_instance = link->panel_cntl->inst; + pwrseq_instance = link->panel_cntl->pwrseq_inst; if (ctx->dc->ctx->dmub_srv && ctx->dc->debug.dmub_command_table) { @@ -881,11 +881,11 @@ void dce110_edp_power_control( if (cntl.action == TRANSMITTER_CONTROL_POWER_ON) { bp_result = ctx->dc_bios->funcs->enable_lvtma_control(ctx->dc_bios, LVTMA_CONTROL_POWER_ON, - panel_instance, link->link_powered_externally); + pwrseq_instance, link->link_powered_externally); } else { bp_result = ctx->dc_bios->funcs->enable_lvtma_control(ctx->dc_bios, 
LVTMA_CONTROL_POWER_OFF, - panel_instance, link->link_powered_externally); + pwrseq_instance, link->link_powered_externally); } } @@ -956,7 +956,7 @@ void dce110_edp_backlight_control( { struct dc_context *ctx = link->ctx; struct bp_transmitter_control cntl = { 0 }; - uint8_t panel_instance; + uint8_t pwrseq_instance; unsigned int pre_T11_delay = OLED_PRE_T11_DELAY; unsigned int post_T7_delay = OLED_POST_T7_DELAY; @@ -1009,7 +1009,7 @@ void dce110_edp_backlight_control( */ /* dc_service_sleep_in_milliseconds(50); */ /*edp 1.2*/ - panel_instance = link->panel_cntl->inst; + pwrseq_instance = link->panel_cntl->pwrseq_inst; if (cntl.action == TRANSMITTER_CONTROL_BACKLIGHT_ON) { if (!link->dc->config.edp_no_power_sequencing) @@ -1034,11 +1034,11 @@ void dce110_edp_backlight_control( if (cntl.action == TRANSMITTER_CONTROL_BACKLIGHT_ON) ctx->dc_bios->funcs->enable_lvtma_control(ctx->dc_bios, LVTMA_CONTROL_LCD_BLON, - panel_instance, link->link_powered_externally); + pwrseq_instance, link->link_powered_externally); else ctx->dc_bios->funcs->enable_lvtma_control(ctx->dc_bios, LVTMA_CONTROL_LCD_BLOFF, - panel_instance, link->link_powered_externally); + pwrseq_instance, link->link_powered_externally); } link_transmitter_control(ctx->dc_bios, &cntl); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c index 467812cf33686..08783ad097d21 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c @@ -137,7 +137,8 @@ void dcn21_PLAT_58856_wa(struct dc_state *context, struct pipe_ctx *pipe_ctx) pipe_ctx->stream->dpms_off = true; } -static bool dmub_abm_set_pipe(struct abm *abm, uint32_t otg_inst, uint32_t option, uint32_t panel_inst) +static bool dmub_abm_set_pipe(struct abm *abm, uint32_t otg_inst, + uint32_t option, uint32_t panel_inst, uint32_t pwrseq_inst) { union dmub_rb_cmd cmd; struct dc_context *dc = abm->ctx; @@ -147,6 +148,7 @@ static bool dmub_abm_set_pipe(struct abm *abm, uint32_t otg_inst, uint32_t optio cmd.abm_set_pipe.header.type = DMUB_CMD__ABM; cmd.abm_set_pipe.header.sub_type = DMUB_CMD__ABM_SET_PIPE; cmd.abm_set_pipe.abm_set_pipe_data.otg_inst = otg_inst; + cmd.abm_set_pipe.abm_set_pipe_data.pwrseq_inst = pwrseq_inst; cmd.abm_set_pipe.abm_set_pipe_data.set_pipe_option = option; cmd.abm_set_pipe.abm_set_pipe_data.panel_inst = panel_inst; cmd.abm_set_pipe.abm_set_pipe_data.ramping_boundary = ramping_boundary; @@ -179,7 +181,6 @@ void dcn21_set_abm_immediate_disable(struct pipe_ctx *pipe_ctx) struct abm *abm = pipe_ctx->stream_res.abm; uint32_t otg_inst = pipe_ctx->stream_res.tg->inst; struct panel_cntl *panel_cntl = pipe_ctx->stream->link->panel_cntl; - struct dmcu *dmcu = pipe_ctx->stream->ctx->dc->res_pool->dmcu; if (dmcu) { @@ -190,9 +191,13 @@ void dcn21_set_abm_immediate_disable(struct pipe_ctx *pipe_ctx) if (abm && panel_cntl) { if (abm->funcs && abm->funcs->set_pipe_ex) { abm->funcs->set_pipe_ex(abm, otg_inst, SET_ABM_PIPE_IMMEDIATELY_DISABLE, - panel_cntl->inst); + panel_cntl->inst, panel_cntl->pwrseq_inst); } else { - dmub_abm_set_pipe(abm, otg_inst, SET_ABM_PIPE_IMMEDIATELY_DISABLE, panel_cntl->inst); + dmub_abm_set_pipe(abm, + otg_inst, + SET_ABM_PIPE_IMMEDIATELY_DISABLE, + panel_cntl->inst, + panel_cntl->pwrseq_inst); } panel_cntl->funcs->store_backlight_level(panel_cntl); } @@ -212,9 +217,16 @@ void dcn21_set_pipe(struct pipe_ctx *pipe_ctx) if (abm && panel_cntl) { if (abm->funcs && abm->funcs->set_pipe_ex) { - 
abm->funcs->set_pipe_ex(abm, otg_inst, SET_ABM_PIPE_NORMAL, panel_cntl->inst); + abm->funcs->set_pipe_ex(abm, + otg_inst, + SET_ABM_PIPE_NORMAL, + panel_cntl->inst, + panel_cntl->pwrseq_inst); } else { - dmub_abm_set_pipe(abm, otg_inst, SET_ABM_PIPE_NORMAL, panel_cntl->inst); + dmub_abm_set_pipe(abm, otg_inst, + SET_ABM_PIPE_NORMAL, + panel_cntl->inst, + panel_cntl->pwrseq_inst); } } } @@ -237,9 +249,17 @@ bool dcn21_set_backlight_level(struct pipe_ctx *pipe_ctx, if (abm && panel_cntl) { if (abm->funcs && abm->funcs->set_pipe_ex) { - abm->funcs->set_pipe_ex(abm, otg_inst, SET_ABM_PIPE_NORMAL, panel_cntl->inst); + abm->funcs->set_pipe_ex(abm, + otg_inst, + SET_ABM_PIPE_NORMAL, + panel_cntl->inst, + panel_cntl->pwrseq_inst); } else { - dmub_abm_set_pipe(abm, otg_inst, SET_ABM_PIPE_NORMAL, panel_cntl->inst); + dmub_abm_set_pipe(abm, + otg_inst, + SET_ABM_PIPE_NORMAL, + panel_cntl->inst, + panel_cntl->pwrseq_inst); } } } diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/abm.h b/drivers/gpu/drm/amd/display/dc/inc/hw/abm.h index 33db15d69f233..9f521cf0fc5a2 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/abm.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/abm.h @@ -64,7 +64,8 @@ struct abm_funcs { bool (*set_pipe_ex)(struct abm *abm, unsigned int otg_inst, unsigned int option, - unsigned int panel_inst); + unsigned int panel_inst, + unsigned int pwrseq_inst); }; #endif diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h b/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h index 24af9d80b9373..248adc1705e35 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/panel_cntl.h @@ -56,12 +56,14 @@ struct panel_cntl_funcs { struct panel_cntl_init_data { struct dc_context *ctx; uint32_t inst; + uint32_t pwrseq_inst; }; struct panel_cntl { const struct panel_cntl_funcs *funcs; struct dc_context *ctx; uint32_t inst; + uint32_t pwrseq_inst; /* registers setting needs to be saved and restored at InitBacklight */ struct panel_cntl_backlight_registers stored_backlight_registers; }; diff --git a/drivers/gpu/drm/amd/display/dc/link/link_factory.c b/drivers/gpu/drm/amd/display/dc/link/link_factory.c index 7abfc67d10a62..ff7801aa552a4 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_factory.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_factory.c @@ -368,6 +368,30 @@ static enum transmitter translate_encoder_to_transmitter( } } +static uint8_t translate_dig_inst_to_pwrseq_inst(struct dc_link *link) +{ + uint8_t pwrseq_inst = 0xF; + struct dc_context *dc_ctx = link->dc->ctx; + + DC_LOGGER_INIT(dc_ctx->logger); + + switch (link->eng_id) { + case ENGINE_ID_DIGA: + pwrseq_inst = 0; + break; + case ENGINE_ID_DIGB: + pwrseq_inst = 1; + break; + default: + DC_LOG_WARNING("Unsupported pwrseq engine id: %d!\n", link->eng_id); + ASSERT(false); + break; + } + + return pwrseq_inst; +} + + static void link_destruct(struct dc_link *link) { int i; @@ -595,24 +619,6 @@ static bool construct_phy(struct dc_link *link, link->ddc_hw_inst = dal_ddc_get_line(get_ddc_pin(link->ddc)); - - if (link->dc->res_pool->funcs->panel_cntl_create && - (link->link_id.id == CONNECTOR_ID_EDP || - link->link_id.id == CONNECTOR_ID_LVDS)) { - panel_cntl_init_data.ctx = dc_ctx; - panel_cntl_init_data.inst = - panel_cntl_init_data.ctx->dc_edp_id_count; - link->panel_cntl = - link->dc->res_pool->funcs->panel_cntl_create( - &panel_cntl_init_data); - panel_cntl_init_data.ctx->dc_edp_id_count++; - - if (link->panel_cntl == NULL) { - DC_ERROR("Failed to create link panel_cntl!\n"); - 
goto panel_cntl_create_fail; - } - } - enc_init_data.ctx = dc_ctx; bp_funcs->get_src_obj(dc_ctx->dc_bios, link->link_id, 0, &enc_init_data.encoder); @@ -643,6 +649,23 @@ static bool construct_phy(struct dc_link *link, link->dc->res_pool->dig_link_enc_count++; link->link_enc_hw_inst = link->link_enc->transmitter; + + if (link->dc->res_pool->funcs->panel_cntl_create && + (link->link_id.id == CONNECTOR_ID_EDP || + link->link_id.id == CONNECTOR_ID_LVDS)) { + panel_cntl_init_data.ctx = dc_ctx; + panel_cntl_init_data.inst = panel_cntl_init_data.ctx->dc_edp_id_count; + panel_cntl_init_data.pwrseq_inst = translate_dig_inst_to_pwrseq_inst(link); + link->panel_cntl = + link->dc->res_pool->funcs->panel_cntl_create( + &panel_cntl_init_data); + panel_cntl_init_data.ctx->dc_edp_id_count++; + + if (link->panel_cntl == NULL) { + DC_ERROR("Failed to create link panel_cntl!\n"); + goto panel_cntl_create_fail; + } + } for (i = 0; i < 4; i++) { if (bp_funcs->get_device_tag(dc_ctx->dc_bios, link->link_id, i, diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index ed4379c047151..3cea96a36432b 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -3357,6 +3357,16 @@ struct dmub_cmd_abm_set_pipe_data { * TODO: Remove. */ uint8_t ramping_boundary; + + /** + * PwrSeq HW Instance. + */ + uint8_t pwrseq_inst; + + /** + * Explicit padding to 4 byte boundary. + */ + uint8_t pad[3]; }; /** @@ -3737,7 +3747,7 @@ enum dmub_cmd_panel_cntl_type { * struct dmub_cmd_panel_cntl_data - Panel control data. */ struct dmub_cmd_panel_cntl_data { - uint32_t inst; /**< panel instance */ + uint32_t pwrseq_inst; /**< pwrseq instance */ uint32_t current_backlight; /* in/out */ uint32_t bl_pwm_cntl; /* in/out */ uint32_t bl_pwm_period_cntl; /* in/out */ @@ -3796,7 +3806,7 @@ struct dmub_cmd_lvtma_control_data { uint8_t uc_pwr_action; /**< LVTMA_ACTION */ uint8_t bypass_panel_control_wait; uint8_t reserved_0[2]; /**< For future use */ - uint8_t panel_inst; /**< LVTMA control instance */ + uint8_t pwrseq_inst; /**< LVTMA control instance */ uint8_t reserved_1[3]; /**< For future use */ }; From 4f6638562db5f9568dbfbe3a2a6ac80c4832f19e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 7 Jan 2024 21:24:07 +0900 Subject: [PATCH 279/304] ksmbd: don't allow O_TRUNC open on read-only share commit d592a9158a112d419f341f035d18d02f8d232def upstream. When file is changed using notepad on read-only share(read_only = yes in ksmbd.conf), There is a problem where existing data is truncated. notepad in windows try to O_TRUNC open(FILE_OVERWRITE_IF) and all data in file is truncated. This patch don't allow O_TRUNC open on read-only share and add KSMBD_TREE_CONN_FLAG_WRITABLE check in smb2_set_info(). 
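On the VFS side FILE_OVERWRITE_IF becomes an O_CREAT | O_TRUNC open, so the writability check in smb2_open() has to refuse both flags on a read-only tree connection; smb2_set_info() gains the same tree-connection check once, replacing the per-info-class copies. The open-path change boils down to:

    /* condensed from smb2_open(): reject creating or truncating opens
     * on a share exported with read_only = yes */
    if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE) &&
        (open_flags & (O_CREAT | O_TRUNC))) {
            ksmbd_debug(SMB, "User does not have write permission\n");
            rc = -EACCES;           /* and the open is aborted */
    }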
Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/server/smb2pdu.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 652ab429bf2e9..ca2c528c9de3a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2971,7 +2971,7 @@ int smb2_open(struct ksmbd_work *work) &may_flags); if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - if (open_flags & O_CREAT) { + if (open_flags & (O_CREAT | O_TRUNC)) { ksmbd_debug(SMB, "User does not have write permission\n"); rc = -EACCES; @@ -5943,12 +5943,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, } case FILE_RENAME_INFORMATION: { - if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - ksmbd_debug(SMB, - "User does not have write permission\n"); - return -EACCES; - } - if (buf_len < sizeof(struct smb2_file_rename_info)) return -EINVAL; @@ -5968,12 +5962,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, } case FILE_DISPOSITION_INFORMATION: { - if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - ksmbd_debug(SMB, - "User does not have write permission\n"); - return -EACCES; - } - if (buf_len < sizeof(struct smb2_file_disposition_info)) return -EINVAL; @@ -6035,7 +6023,7 @@ int smb2_set_info(struct ksmbd_work *work) { struct smb2_set_info_req *req; struct smb2_set_info_rsp *rsp; - struct ksmbd_file *fp; + struct ksmbd_file *fp = NULL; int rc = 0; unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; @@ -6055,6 +6043,13 @@ int smb2_set_info(struct ksmbd_work *work) rsp = smb2_get_msg(work->response_buf); } + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, "User does not have write permission\n"); + pr_err("User does not have write permission\n"); + rc = -EACCES; + goto err_out; + } + if (!has_file_id(id)) { id = req->VolatileFileId; pid = req->PersistentFileId; From 68f9b0945b19e8140239c6043f7a9b01f1d1a529 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Tue, 9 Jan 2024 17:14:44 +0300 Subject: [PATCH 280/304] ksmbd: free ppace array on error in parse_dacl commit 8cf9bedfc3c47d24bb0de386f808f925dc52863e upstream. The ppace array is not freed if one of the init_acl_state() calls inside parse_dacl() fails. At the moment the function may fail only due to the memory allocation errors so it's highly unlikely in this case but nevertheless a fix is needed. Move ppace allocation after the init_acl_state() calls with proper error handling. Found by Linux Verification Center (linuxtesting.org). 
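The fix amounts to reordering parse_dacl() so that the one allocation with extra state to unwind comes last. Condensed from the hunks below into a single sketch (the error path of the second init_acl_state() call is inferred from the surrounding context, not shown verbatim in the diff):

        ret = init_acl_state(&acl_state, num_aces);
        if (ret)
                return;
        ret = init_acl_state(&default_acl_state, num_aces);
        if (ret) {
                free_acl_state(&acl_state);
                return;
        }

        /* Allocate last: earlier failures now have nothing extra to free. */
        ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL);
        if (!ppace) {
                free_acl_state(&default_acl_state);
                free_acl_state(&acl_state);
                return;
        }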
Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Acked-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/server/smbacl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 1164365533f08..1c9775f1efa56 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -401,10 +401,6 @@ static void parse_dacl(struct mnt_idmap *idmap, if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) return; - ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); - if (!ppace) - return; - ret = init_acl_state(&acl_state, num_aces); if (ret) return; @@ -414,6 +410,13 @@ static void parse_dacl(struct mnt_idmap *idmap, return; } + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); + if (!ppace) { + free_acl_state(&default_acl_state); + free_acl_state(&acl_state); + return; + } + /* * reset rwx permissions for user/group/other. * Also, if num_aces is 0 i.e. DACL has no ACEs, From 0de40f76d567133b871cd6ad46bb87afbce46983 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 8 Nov 2023 10:22:16 -0800 Subject: [PATCH 281/304] Revert "md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d" commit bed9e27baf52a09b7ba2a3714f1e24e17ced386d upstream. This reverts commit 5e2cf333b7bd5d3e62595a44d598a254c697cd74. That commit introduced the following race and can cause system hung. md_write_start: raid5d: // mddev->in_sync == 1 set "MD_SB_CHANGE_PENDING" // running before md_write_start wakeup it waiting "MD_SB_CHANGE_PENDING" cleared >>>>>>>>> hung wakeup mddev->thread ... waiting "MD_SB_CHANGE_PENDING" cleared >>>> hung, raid5d should clear this flag but get hung by same flag. The issue reverted commit fixing is fixed by last patch in a new way. Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d") Cc: stable@vger.kernel.org # v5.19+ Signed-off-by: Junxiao Bi Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231108182216.73611-2-junxiao.bi@oracle.com Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 26e1e8a5e9419..b02b1a3010f71 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,7 +36,6 @@ */ #include -#include #include #include #include @@ -6820,18 +6819,7 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); - - /* - * Waiting on MD_SB_CHANGE_PENDING below may deadlock - * seeing md_check_recovery() is needed to clear - * the flag when using mdmon. - */ - continue; } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); From 4a6eefb46b5f9dda9e7d4cde45efdbacf0c4b4ec Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 1 Dec 2023 17:21:30 +0000 Subject: [PATCH 282/304] binder: use EPOLLERR from eventpoll.h commit 6ac061db9c58ca5b9270b1b3940d2464fb3ff183 upstream. Use EPOLLERR instead of POLLERR to make sure it is cast to the correct __poll_t type. 
This fixes the following sparse issue: drivers/android/binder.c:5030:24: warning: incorrect type in return expression (different base types) drivers/android/binder.c:5030:24: expected restricted __poll_t drivers/android/binder.c:5030:24: got int Fixes: f88982679f54 ("binder: check for binder_thread allocation failure in binder_poll()") Cc: stable@vger.kernel.org Cc: Eric Biggers Reviewed-by: Alice Ryhl Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20231201172212.1813387-2-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 92128aae2d060..71a40a4c546f5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5030,7 +5030,7 @@ static __poll_t binder_poll(struct file *filp, thread = binder_get_thread(proc); if (!thread) - return POLLERR; + return EPOLLERR; binder_inner_proc_lock(thread->proc); thread->looper |= BINDER_LOOPER_STATE_POLL; From e074686e993ff1be5f21b085a3b1b4275ccd5727 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 1 Dec 2023 17:21:31 +0000 Subject: [PATCH 283/304] binder: fix use-after-free in shinker's callback commit 3f489c2067c5824528212b0fc18b28d51332d906 upstream. The mmap read lock is used during the shrinker's callback, which means that using alloc->vma pointer isn't safe as it can race with munmap(). As of commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") the mmap lock is downgraded after the vma has been isolated. I was able to reproduce this issue by manually adding some delays and triggering page reclaiming through the shrinker's debug sysfs. The following KASAN report confirms the UAF: ================================================================== BUG: KASAN: slab-use-after-free in zap_page_range_single+0x470/0x4b8 Read of size 8 at addr ffff356ed50e50f0 by task bash/478 CPU: 1 PID: 478 Comm: bash Not tainted 6.6.0-rc5-00055-g1c8b86a3799f-dirty #70 Hardware name: linux,dummy-virt (DT) Call trace: zap_page_range_single+0x470/0x4b8 binder_alloc_free_page+0x608/0xadc __list_lru_walk_one+0x130/0x3b0 list_lru_walk_node+0xc4/0x22c binder_shrink_scan+0x108/0x1dc shrinker_debugfs_scan_write+0x2b4/0x500 full_proxy_write+0xd4/0x140 vfs_write+0x1ac/0x758 ksys_write+0xf0/0x1dc __arm64_sys_write+0x6c/0x9c Allocated by task 492: kmem_cache_alloc+0x130/0x368 vm_area_alloc+0x2c/0x190 mmap_region+0x258/0x18bc do_mmap+0x694/0xa60 vm_mmap_pgoff+0x170/0x29c ksys_mmap_pgoff+0x290/0x3a0 __arm64_sys_mmap+0xcc/0x144 Freed by task 491: kmem_cache_free+0x17c/0x3c8 vm_area_free_rcu_cb+0x74/0x98 rcu_core+0xa38/0x26d4 rcu_core_si+0x10/0x1c __do_softirq+0x2fc/0xd24 Last potentially related work creation: __call_rcu_common.constprop.0+0x6c/0xba0 call_rcu+0x10/0x1c vm_area_free+0x18/0x24 remove_vma+0xe4/0x118 do_vmi_align_munmap.isra.0+0x718/0xb5c do_vmi_munmap+0xdc/0x1fc __vm_munmap+0x10c/0x278 __arm64_sys_munmap+0x58/0x7c Fix this issue by performing instead a vma_lookup() which will fail to find the vma that was isolated before the mmap lock downgrade. Note that this option has better performance than upgrading to a mmap write lock which would increase contention. Plus, mmap_write_trylock() has been recently removed anyway. 
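Condensed from the hunk below, the shrinker callback now revalidates the mapping under the mmap read lock instead of dereferencing the cached alloc->vma: a vma that was already isolated by a concurrent munmap() is not returned by vma_lookup(), so a stale pointer can never reach zap_page_range_single(), and a hit that belongs to some other mapping is treated as an error:

        if (!mmap_read_trylock(mm))
                goto err_mmap_read_lock_failed;

        /* Re-look the address up under the lock rather than trusting alloc->vma. */
        vma = vma_lookup(mm, page_addr);
        if (vma && vma != binder_alloc_get_vma(alloc))
                goto err_invalid_vma;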
Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Cc: stable@vger.kernel.org Cc: Liam Howlett Cc: Minchan Kim Reviewed-by: Alice Ryhl Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20231201172212.1813387-3-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 138f6d43d13b2..9d2eff70c3bab 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1005,7 +1005,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item, goto err_mmget; if (!mmap_read_trylock(mm)) goto err_mmap_read_lock_failed; - vma = binder_alloc_get_vma(alloc); + vma = vma_lookup(mm, page_addr); + if (vma && vma != binder_alloc_get_vma(alloc)) + goto err_invalid_vma; list_lru_isolate(lru, item); spin_unlock(lock); @@ -1031,6 +1033,8 @@ enum lru_status binder_alloc_free_page(struct list_head *item, mutex_unlock(&alloc->mutex); return LRU_REMOVED_RETRY; +err_invalid_vma: + mmap_read_unlock(mm); err_mmap_read_lock_failed: mmput_async(mm); err_mmget: From 220f3a317bb6f865a3ab8f078bf7bfa1eaf9f34a Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 1 Dec 2023 17:21:35 +0000 Subject: [PATCH 284/304] binder: fix trivial typo of binder_free_buf_locked() commit 122a3c1cb0ff304c2b8934584fcfea4edb2fe5e3 upstream. Fix minor misspelling of the function in the comment section. No functional changes in this patch. Cc: stable@vger.kernel.org Fixes: 0f966cba95c7 ("binder: add flag to clear buffer on txn complete") Reviewed-by: Alice Ryhl Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20231201172212.1813387-7-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9d2eff70c3bab..db340f0575a8b 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -706,7 +706,7 @@ void binder_alloc_free_buf(struct binder_alloc *alloc, /* * We could eliminate the call to binder_alloc_clear_buf() * from binder_alloc_deferred_release() by moving this to - * binder_alloc_free_buf_locked(). However, that could + * binder_free_buf_locked(). However, that could * increase contention for the alloc mutex if clear_on_free * is used frequently for large buffers. The mutex is not * needed for correctness here. From 9eae760e748ccb0512ef9a9836161cd3cc98f277 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 1 Dec 2023 17:21:36 +0000 Subject: [PATCH 285/304] binder: fix comment on binder_alloc_new_buf() return value commit e1090371e02b601cbfcea175c2a6cc7c955fa830 upstream. Update the comments of binder_alloc_new_buf() to reflect that the return value of the function is now ERR_PTR(-errno) on failure. No functional changes in this patch. 
Cc: stable@vger.kernel.org Fixes: 57ada2fb2250 ("binder: add log information for binder transaction failures") Reviewed-by: Alice Ryhl Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20231201172212.1813387-8-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index db340f0575a8b..e5fa2042585a4 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -557,7 +557,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked( * is the sum of the three given sizes (each rounded up to * pointer-sized boundary) * - * Return: The allocated buffer or %NULL if error + * Return: The allocated buffer or %ERR_PTR(-errno) if error */ struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, From 913205930da6213305616ac539447702eaa85e41 Mon Sep 17 00:00:00 2001 From: Guanghui Feng Date: Thu, 21 Dec 2023 17:57:43 +0800 Subject: [PATCH 286/304] uio: Fix use-after-free in uio_open commit 0c9ae0b8605078eafc3bea053cc78791e97ba2e2 upstream. core-1 core-2 ------------------------------------------------------- uio_unregister_device uio_open idev = idr_find() device_unregister(&idev->dev) put_device(&idev->dev) uio_device_release get_device(&idev->dev) kfree(idev) uio_free_minor(minor) uio_release put_device(&idev->dev) kfree(idev) ------------------------------------------------------- In the core-1 uio_unregister_device(), the device_unregister will kfree idev when the idev->dev kobject ref is 1. But after core-1 device_unregister, put_device and before doing kfree, the core-2 may get_device. Then: 1. After core-1 kfree idev, the core-2 will do use-after-free for idev. 2. When core-2 do uio_release and put_device, the idev will be double freed. To address this issue, we can get idev atomic & inc idev reference with minor_lock. Fixes: 57c5f4df0a5a ("uio: fix crash after the device is unregistered") Cc: stable Signed-off-by: Guanghui Feng Reviewed-by: Baolin Wang Link: https://lore.kernel.org/r/1703152663-59949-1-git-send-email-guanghuifeng@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 62082d64ece00..2d572f6c8ec83 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -466,13 +466,13 @@ static int uio_open(struct inode *inode, struct file *filep) mutex_lock(&minor_lock); idev = idr_find(&uio_idr, iminor(inode)); - mutex_unlock(&minor_lock); if (!idev) { ret = -ENODEV; + mutex_unlock(&minor_lock); goto out; } - get_device(&idev->dev); + mutex_unlock(&minor_lock); if (!try_module_get(idev->owner)) { ret = -ENODEV; @@ -1064,9 +1064,8 @@ void uio_unregister_device(struct uio_info *info) wake_up_interruptible(&idev->wait); kill_fasync(&idev->async_queue, SIGIO, POLL_HUP); - device_unregister(&idev->dev); - uio_free_minor(minor); + device_unregister(&idev->dev); return; } From 2d4b32c0d4aba59b9ffd5998123fa831fd5d341d Mon Sep 17 00:00:00 2001 From: Cameron Williams Date: Thu, 2 Nov 2023 21:07:05 +0000 Subject: [PATCH 287/304] parport: parport_serial: Add Brainboxes BAR details commit 65fde134b0a4ffe838729f9ee11b459a2f6f2815 upstream. Add BAR/enum entries for Brainboxes serial/parallel cards. 
Cc: Signed-off-by: Cameron Williams Acked-by: Sudip Mukherjee Link: https://lore.kernel.org/r/AS4PR02MB79035155C2D5C3333AE6FA52C4A6A@AS4PR02MB7903.eurprd02.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/parport/parport_serial.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/parport/parport_serial.c b/drivers/parport/parport_serial.c index 9f5d784cd95d5..11989368611a1 100644 --- a/drivers/parport/parport_serial.c +++ b/drivers/parport/parport_serial.c @@ -65,6 +65,10 @@ enum parport_pc_pci_cards { sunix_5069a, sunix_5079a, sunix_5099a, + brainboxes_uc257, + brainboxes_is300, + brainboxes_uc414, + brainboxes_px263, }; /* each element directly indexed from enum list, above */ @@ -158,6 +162,10 @@ static struct parport_pc_pci cards[] = { /* sunix_5069a */ { 1, { { 1, 2 }, } }, /* sunix_5079a */ { 1, { { 1, 2 }, } }, /* sunix_5099a */ { 1, { { 1, 2 }, } }, + /* brainboxes_uc257 */ { 1, { { 3, -1 }, } }, + /* brainboxes_is300 */ { 1, { { 3, -1 }, } }, + /* brainboxes_uc414 */ { 1, { { 3, -1 }, } }, + /* brainboxes_px263 */ { 1, { { 3, -1 }, } }, }; static struct pci_device_id parport_serial_pci_tbl[] = { From dc927d3fa5ea6a86806c674a04b30aedd40a0c44 Mon Sep 17 00:00:00 2001 From: Cameron Williams Date: Thu, 2 Nov 2023 21:07:06 +0000 Subject: [PATCH 288/304] parport: parport_serial: Add Brainboxes device IDs and geometry commit 6aa1fc5a8085bbc01687aa708dcf2dbe637a5ee3 upstream. Add device IDs for the Brainboxes UC-203, UC-257, UC-414, UC-475, IS-300/IS-500 and PX-263/PX-295 and define the relevant "geometry" for the cards. This patch requires part 1 of this series. Cc: Signed-off-by: Cameron Williams Acked-by: Sudip Mukherjee Link: https://lore.kernel.org/r/AS4PR02MB7903A4094564BE28F1F926A6C4A6A@AS4PR02MB7903.eurprd02.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/parport/parport_serial.c | 56 ++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/drivers/parport/parport_serial.c b/drivers/parport/parport_serial.c index 11989368611a1..3644997a83425 100644 --- a/drivers/parport/parport_serial.c +++ b/drivers/parport/parport_serial.c @@ -285,6 +285,38 @@ static struct pci_device_id parport_serial_pci_tbl[] = { { PCI_VENDOR_ID_SUNIX, PCI_DEVICE_ID_SUNIX_1999, PCI_VENDOR_ID_SUNIX, 0x0104, 0, 0, sunix_5099a }, + /* Brainboxes UC-203 */ + { PCI_VENDOR_ID_INTASHIELD, 0x0bc1, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc257 }, + { PCI_VENDOR_ID_INTASHIELD, 0x0bc2, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc257 }, + + /* Brainboxes UC-257 */ + { PCI_VENDOR_ID_INTASHIELD, 0x0861, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc257 }, + { PCI_VENDOR_ID_INTASHIELD, 0x0862, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc257 }, + { PCI_VENDOR_ID_INTASHIELD, 0x0863, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc257 }, + + /* Brainboxes UC-414 */ + { PCI_VENDOR_ID_INTASHIELD, 0x0e61, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc414 }, + + /* Brainboxes UC-475 */ + { PCI_VENDOR_ID_INTASHIELD, 0x0981, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc257 }, + { PCI_VENDOR_ID_INTASHIELD, 0x0982, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc257 }, + + /* Brainboxes IS-300/IS-500 */ + { PCI_VENDOR_ID_INTASHIELD, 0x0da0, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_is300 }, + + /* Brainboxes PX-263/PX-295 */ + { PCI_VENDOR_ID_INTASHIELD, 0x402c, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_px263 }, + { 0, } /* terminate list */ }; MODULE_DEVICE_TABLE(pci,parport_serial_pci_tbl); @@ -550,6 +582,30 @@ static struct pciserial_board pci_parport_serial_boards[] 
= { .base_baud = 921600, .uart_offset = 0x8, }, + [brainboxes_uc257] = { + .flags = FL_BASE2, + .num_ports = 2, + .base_baud = 115200, + .uart_offset = 8, + }, + [brainboxes_is300] = { + .flags = FL_BASE2, + .num_ports = 1, + .base_baud = 115200, + .uart_offset = 8, + }, + [brainboxes_uc414] = { + .flags = FL_BASE2, + .num_ports = 4, + .base_baud = 115200, + .uart_offset = 8, + }, + [brainboxes_px263] = { + .flags = FL_BASE2, + .num_ports = 4, + .base_baud = 921600, + .uart_offset = 8, + }, }; struct parport_serial_private { From 54a298fa028c8d8072532f19fbb54599c3492bc8 Mon Sep 17 00:00:00 2001 From: Florian Eckert Date: Mon, 27 Nov 2023 09:16:21 +0100 Subject: [PATCH 289/304] leds: ledtrig-tty: Free allocated ttyname buffer on deactivate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 25054b232681c286fca9c678854f56494d1352cc upstream. The ttyname buffer for the ledtrig_tty_data struct is allocated in the sysfs ttyname_store() function. This buffer must be released on trigger deactivation. This was missing and is thus a memory leak. While we are at it, the TTY handler in the ledtrig_tty_data struct should also be returned in case of the trigger deactivation call. Cc: stable@vger.kernel.org Fixes: fd4a641ac88f ("leds: trigger: implement a tty trigger") Signed-off-by: Florian Eckert Reviewed-by: Uwe Kleine-König Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20231127081621.774866-1-fe@dev.tdt.de Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- drivers/leds/trigger/ledtrig-tty.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/leds/trigger/ledtrig-tty.c b/drivers/leds/trigger/ledtrig-tty.c index 8ae0d2d284aff..3e69a7bde9284 100644 --- a/drivers/leds/trigger/ledtrig-tty.c +++ b/drivers/leds/trigger/ledtrig-tty.c @@ -168,6 +168,10 @@ static void ledtrig_tty_deactivate(struct led_classdev *led_cdev) cancel_delayed_work_sync(&trigger_data->dwork); + kfree(trigger_data->ttyname); + tty_kref_put(trigger_data->tty); + trigger_data->tty = NULL; + kfree(trigger_data); } From 2498b0bbb736ac26aae799bf3d9582abab644d2b Mon Sep 17 00:00:00 2001 From: LeoLiuoc Date: Mon, 11 Dec 2023 17:15:43 +0800 Subject: [PATCH 290/304] PCI: Add ACS quirk for more Zhaoxin Root Ports commit e367e3c765f5477b2e79da0f1399aed49e2d1e37 upstream. Add more Root Port Device IDs to pci_quirk_zhaoxin_pcie_ports_acs() for some new Zhaoxin platforms. Fixes: 299bd044a6f3 ("PCI: Add ACS quirk for Zhaoxin Root/Downstream Ports") Link: https://lore.kernel.org/r/20231211091543.735903-1-LeoLiu-oc@zhaoxin.com Signed-off-by: LeoLiuoc [bhelgaas: update subject, drop changelog, add Fixes, add stable tag, fix whitespace, wrap code comment] Signed-off-by: Bjorn Helgaas Cc: # 5.7 Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ea476252280ab..d55a3ffae4b8b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4699,17 +4699,21 @@ static int pci_quirk_xgene_acs(struct pci_dev *dev, u16 acs_flags) * But the implementation could block peer-to-peer transactions between them * and provide ACS-like functionality. 
*/ -static int pci_quirk_zhaoxin_pcie_ports_acs(struct pci_dev *dev, u16 acs_flags) +static int pci_quirk_zhaoxin_pcie_ports_acs(struct pci_dev *dev, u16 acs_flags) { if (!pci_is_pcie(dev) || ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) && (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))) return -ENOTTY; + /* + * Future Zhaoxin Root Ports and Switch Downstream Ports will + * implement ACS capability in accordance with the PCIe Spec. + */ switch (dev->device) { case 0x0710 ... 0x071e: case 0x0721: - case 0x0723 ... 0x0732: + case 0x0723 ... 0x0752: return pci_acs_ctrl_enabled(acs_flags, PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); } From a6d3eb9d4ed5e561cfe7a7e3ce7c71357bd196b7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 1 Nov 2023 11:52:06 +0000 Subject: [PATCH 291/304] coresight: etm4x: Fix width of CCITMIN field commit cc0271a339cc70cae914c3ec20edc2a8058407da upstream. CCITMIN is a 12 bit field and doesn't fit in a u8, so extend it to u16. This probably wasn't an issue previously because values higher than 255 never occurred. But since commit 4aff040bcc8d ("coresight: etm: Override TRCIDR3.CCITMIN on errata affected cpus"), a comparison with 256 was done to enable the errata, generating the following W=1 build error: coresight-etm4x-core.c:1188:24: error: result of comparison of constant 256 with expression of type 'u8' (aka 'unsigned char') is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (drvdata->ccitmin == 256) Cc: stable@vger.kernel.org Fixes: 2e1cdfe184b5 ("coresight-etm4x: Adding CoreSight ETM4x driver") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310302043.as36UFED-lkp@intel.com/ Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20231101115206.70810-1-james.clark@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-etm4x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h index 20e2e4cb76146..da17b6c49b0f1 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.h +++ b/drivers/hwtracing/coresight/coresight-etm4x.h @@ -1036,7 +1036,7 @@ struct etmv4_drvdata { u8 ctxid_size; u8 vmid_size; u8 ccsize; - u8 ccitmin; + u16 ccitmin; u8 s_ex_level; u8 ns_ex_level; u8 q_support; From 2f2eb48bbe34f5b70d8da7d2e26b9222399d367f Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 29 Sep 2023 03:48:17 +0000 Subject: [PATCH 292/304] scripts/decode_stacktrace.sh: optionally use LLVM utilities commit efbd6398353315b7018e6943e41fee9ec35e875f upstream. GNU's addr2line can have problems parsing a vmlinux built with LLVM, particularly when LTO was used. In order to decode the traces correctly this patch adds the ability to switch to LLVM's utilities readelf and addr2line. The same approach is followed by Will in [1]. Before: $ scripts/decode_stacktrace.sh vmlinux < kernel.log [17716.240635] Call trace: [17716.240646] skb_cow_data (??:?) [17716.240654] esp6_input (ld-temp.o:?) [17716.240666] xfrm_input (ld-temp.o:?) [17716.240674] xfrm6_rcv (??:?) [...] After: $ LLVM=1 scripts/decode_stacktrace.sh vmlinux < kernel.log [17716.240635] Call trace: [17716.240646] skb_cow_data (include/linux/skbuff.h:2172 net/core/skbuff.c:4503) [17716.240654] esp6_input (net/ipv6/esp6.c:977) [17716.240666] xfrm_input (net/xfrm/xfrm_input.c:659) [17716.240674] xfrm6_rcv (net/ipv6/xfrm6_input.c:172) [...] 
Note that one could set CROSS_COMPILE=llvm- instead to hack around this issue. However, doing so can break the decodecode routine as it will force the selection of other LLVM utilities down the line e.g. llvm-as. [1] https://lore.kernel.org/all/20230914131225.13415-3-will@kernel.org/ Link: https://lkml.kernel.org/r/20230929034836.403735-1-cmllamas@google.com Signed-off-by: Carlos Llamas Reviewed-by: Nick Desaulniers Reviewed-by: Elliot Berman Tested-by: Justin Stitt Cc: Will Deacon Cc: John Stultz Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Tom Rix Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- scripts/decode_stacktrace.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 564c5632e1a24..bfe5a4082d8ea 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -16,6 +16,21 @@ elif type c++filt >/dev/null 2>&1 ; then cppfilt_opts=-i fi +UTIL_SUFFIX= +if [[ -z ${LLVM:-} ]]; then + UTIL_PREFIX=${CROSS_COMPILE:-} +else + UTIL_PREFIX=llvm- + if [[ ${LLVM} == */ ]]; then + UTIL_PREFIX=${LLVM}${UTIL_PREFIX} + elif [[ ${LLVM} == -* ]]; then + UTIL_SUFFIX=${LLVM} + fi +fi + +READELF=${UTIL_PREFIX}readelf${UTIL_SUFFIX} +ADDR2LINE=${UTIL_PREFIX}addr2line${UTIL_SUFFIX} + if [[ $1 == "-r" ]] ; then vmlinux="" basepath="auto" @@ -75,7 +90,7 @@ find_module() { if [[ "$modpath" != "" ]] ; then for fn in $(find "$modpath" -name "${module//_/[-_]}.ko*") ; do - if readelf -WS "$fn" | grep -qwF .debug_line ; then + if ${READELF} -WS "$fn" | grep -qwF .debug_line ; then echo $fn return fi @@ -169,7 +184,7 @@ parse_symbol() { if [[ $aarray_support == true && "${cache[$module,$address]+isset}" == "isset" ]]; then local code=${cache[$module,$address]} else - local code=$(${CROSS_COMPILE}addr2line -i -e "$objfile" "$address" 2>/dev/null) + local code=$(${ADDR2LINE} -i -e "$objfile" "$address" 2>/dev/null) if [[ $aarray_support == true ]]; then cache[$module,$address]=$code fi From f7f4ff54c440c5324229e5ee204698f95487ae73 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 10 Jan 2024 18:47:58 +0100 Subject: [PATCH 293/304] docs: kernel_feat.py: fix potential command injection commit c48a7c44a1d02516309015b6134c9bb982e17008 upstream. The kernel-feat directive passes its argument straight to the shell. This is unfortunate and unnecessary. Let's always use paths relative to $srctree/Documentation/ and use subprocess.check_call() instead of subprocess.Popen(shell=True). This also makes the code shorter. This is analogous to commit 3231dd586277 ("docs: kernel_abi.py: fix command injection") where we did exactly the same thing for kernel_abi.py, somehow I completely missed this one. 
Link: https://fosstodon.org/@jani/111676532203641247 Reported-by: Jani Nikula Signed-off-by: Vegard Nossum Cc: stable@vger.kernel.org Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240110174758.3680506-1-vegard.nossum@oracle.com Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/features.rst | 2 +- Documentation/arch/arc/features.rst | 2 +- Documentation/arch/arm/features.rst | 2 +- Documentation/arch/arm64/features.rst | 2 +- Documentation/arch/loongarch/features.rst | 2 +- Documentation/arch/m68k/features.rst | 2 +- Documentation/arch/mips/features.rst | 2 +- Documentation/arch/nios2/features.rst | 2 +- Documentation/arch/openrisc/features.rst | 2 +- Documentation/arch/parisc/features.rst | 2 +- Documentation/arch/powerpc/features.rst | 2 +- Documentation/arch/riscv/features.rst | 2 +- Documentation/arch/s390/features.rst | 2 +- Documentation/arch/sh/features.rst | 2 +- Documentation/arch/sparc/features.rst | 2 +- Documentation/arch/x86/features.rst | 2 +- Documentation/arch/xtensa/features.rst | 2 +- Documentation/sphinx/kernel_feat.py | 55 ++++--------------- .../zh_CN/arch/loongarch/features.rst | 2 +- .../translations/zh_CN/arch/mips/features.rst | 2 +- .../zh_TW/arch/loongarch/features.rst | 2 +- .../translations/zh_TW/arch/mips/features.rst | 2 +- 22 files changed, 32 insertions(+), 65 deletions(-) diff --git a/Documentation/admin-guide/features.rst b/Documentation/admin-guide/features.rst index 8c167082a84f9..7651eca38227d 100644 --- a/Documentation/admin-guide/features.rst +++ b/Documentation/admin-guide/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features +.. kernel-feat:: features diff --git a/Documentation/arch/arc/features.rst b/Documentation/arch/arc/features.rst index b793583d688a4..49ff446ff744c 100644 --- a/Documentation/arch/arc/features.rst +++ b/Documentation/arch/arc/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features arc +.. kernel-feat:: features arc diff --git a/Documentation/arch/arm/features.rst b/Documentation/arch/arm/features.rst index 7414ec03dd157..0e76aaf68ecab 100644 --- a/Documentation/arch/arm/features.rst +++ b/Documentation/arch/arm/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features arm +.. kernel-feat:: features arm diff --git a/Documentation/arch/arm64/features.rst b/Documentation/arch/arm64/features.rst index dfa4cb3cd3efa..03321f4309d0b 100644 --- a/Documentation/arch/arm64/features.rst +++ b/Documentation/arch/arm64/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features arm64 +.. kernel-feat:: features arm64 diff --git a/Documentation/arch/loongarch/features.rst b/Documentation/arch/loongarch/features.rst index ebacade3ea454..009f44c7951f8 100644 --- a/Documentation/arch/loongarch/features.rst +++ b/Documentation/arch/loongarch/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features loongarch +.. kernel-feat:: features loongarch diff --git a/Documentation/arch/m68k/features.rst b/Documentation/arch/m68k/features.rst index 5107a21194724..de7f0ccf7fc8e 100644 --- a/Documentation/arch/m68k/features.rst +++ b/Documentation/arch/m68k/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features m68k +.. 
kernel-feat:: features m68k diff --git a/Documentation/arch/mips/features.rst b/Documentation/arch/mips/features.rst index 1973d729b29a9..6e0ffe3e73540 100644 --- a/Documentation/arch/mips/features.rst +++ b/Documentation/arch/mips/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features mips +.. kernel-feat:: features mips diff --git a/Documentation/arch/nios2/features.rst b/Documentation/arch/nios2/features.rst index 8449e63f69b2b..89913810ccb5a 100644 --- a/Documentation/arch/nios2/features.rst +++ b/Documentation/arch/nios2/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features nios2 +.. kernel-feat:: features nios2 diff --git a/Documentation/arch/openrisc/features.rst b/Documentation/arch/openrisc/features.rst index 3f7c40d219f2c..bae2e25adfd64 100644 --- a/Documentation/arch/openrisc/features.rst +++ b/Documentation/arch/openrisc/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features openrisc +.. kernel-feat:: features openrisc diff --git a/Documentation/arch/parisc/features.rst b/Documentation/arch/parisc/features.rst index 501d7c4500379..b3aa4d243b936 100644 --- a/Documentation/arch/parisc/features.rst +++ b/Documentation/arch/parisc/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features parisc +.. kernel-feat:: features parisc diff --git a/Documentation/arch/powerpc/features.rst b/Documentation/arch/powerpc/features.rst index aeae73df86b0c..ee4b95e04202d 100644 --- a/Documentation/arch/powerpc/features.rst +++ b/Documentation/arch/powerpc/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features powerpc +.. kernel-feat:: features powerpc diff --git a/Documentation/arch/riscv/features.rst b/Documentation/arch/riscv/features.rst index c70ef6ac2368c..36e90144adabd 100644 --- a/Documentation/arch/riscv/features.rst +++ b/Documentation/arch/riscv/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features riscv +.. kernel-feat:: features riscv diff --git a/Documentation/arch/s390/features.rst b/Documentation/arch/s390/features.rst index 57c296a9d8f30..2883dc9506817 100644 --- a/Documentation/arch/s390/features.rst +++ b/Documentation/arch/s390/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features s390 +.. kernel-feat:: features s390 diff --git a/Documentation/arch/sh/features.rst b/Documentation/arch/sh/features.rst index f722af3b6c993..fae48fe81e9bd 100644 --- a/Documentation/arch/sh/features.rst +++ b/Documentation/arch/sh/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features sh +.. kernel-feat:: features sh diff --git a/Documentation/arch/sparc/features.rst b/Documentation/arch/sparc/features.rst index c0c92468b0fe9..96835b6d598a1 100644 --- a/Documentation/arch/sparc/features.rst +++ b/Documentation/arch/sparc/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features sparc +.. kernel-feat:: features sparc diff --git a/Documentation/arch/x86/features.rst b/Documentation/arch/x86/features.rst index b663f15053ce8..a33616346a388 100644 --- a/Documentation/arch/x86/features.rst +++ b/Documentation/arch/x86/features.rst @@ -1,3 +1,3 @@ .. 
SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features x86 +.. kernel-feat:: features x86 diff --git a/Documentation/arch/xtensa/features.rst b/Documentation/arch/xtensa/features.rst index 6b92c7bfa19da..28dcce1759be4 100644 --- a/Documentation/arch/xtensa/features.rst +++ b/Documentation/arch/xtensa/features.rst @@ -1,3 +1,3 @@ .. SPDX-License-Identifier: GPL-2.0 -.. kernel-feat:: $srctree/Documentation/features xtensa +.. kernel-feat:: features xtensa diff --git a/Documentation/sphinx/kernel_feat.py b/Documentation/sphinx/kernel_feat.py index b5fa2f0542a5d..b9df61eb45013 100644 --- a/Documentation/sphinx/kernel_feat.py +++ b/Documentation/sphinx/kernel_feat.py @@ -37,8 +37,6 @@ import subprocess import sys -from os import path - from docutils import nodes, statemachine from docutils.statemachine import ViewList from docutils.parsers.rst import directives, Directive @@ -76,33 +74,26 @@ def warn(self, message, **replace): self.state.document.settings.env.app.warn(message, prefix="") def run(self): - doc = self.state.document if not doc.settings.file_insertion_enabled: raise self.warning("docutils: file insertion disabled") env = doc.settings.env - cwd = path.dirname(doc.current_source) - cmd = "get_feat.pl rest --enable-fname --dir " - cmd += self.arguments[0] - - if len(self.arguments) > 1: - cmd += " --arch " + self.arguments[1] - srctree = path.abspath(os.environ["srctree"]) + srctree = os.path.abspath(os.environ["srctree"]) - fname = cmd + args = [ + os.path.join(srctree, 'scripts/get_feat.pl'), + 'rest', + '--enable-fname', + '--dir', + os.path.join(srctree, 'Documentation', self.arguments[0]), + ] - # extend PATH with $(srctree)/scripts - path_env = os.pathsep.join([ - srctree + os.sep + "scripts", - os.environ["PATH"] - ]) - shell_env = os.environ.copy() - shell_env["PATH"] = path_env - shell_env["srctree"] = srctree + if len(self.arguments) > 1: + args.extend(['--arch', self.arguments[1]]) - lines = self.runCmd(cmd, shell=True, cwd=cwd, env=shell_env) + lines = subprocess.check_output(args, cwd=os.path.dirname(doc.current_source)).decode('utf-8') line_regex = re.compile(r"^\.\. FILE (\S+)$") @@ -121,30 +112,6 @@ def run(self): nodeList = self.nestedParse(out_lines, fname) return nodeList - def runCmd(self, cmd, **kwargs): - u"""Run command ``cmd`` and return its stdout as unicode.""" - - try: - proc = subprocess.Popen( - cmd - , stdout = subprocess.PIPE - , stderr = subprocess.PIPE - , **kwargs - ) - out, err = proc.communicate() - - out, err = codecs.decode(out, 'utf-8'), codecs.decode(err, 'utf-8') - - if proc.returncode != 0: - raise self.severe( - u"command '%s' failed with return code %d" - % (cmd, proc.returncode) - ) - except OSError as exc: - raise self.severe(u"problems with '%s' directive: %s." - % (self.name, ErrorString(exc))) - return out - def nestedParse(self, lines, fname): content = ViewList() node = nodes.section() diff --git a/Documentation/translations/zh_CN/arch/loongarch/features.rst b/Documentation/translations/zh_CN/arch/loongarch/features.rst index 82bfac180bdc0..cec38dda8298c 100644 --- a/Documentation/translations/zh_CN/arch/loongarch/features.rst +++ b/Documentation/translations/zh_CN/arch/loongarch/features.rst @@ -5,4 +5,4 @@ :Original: Documentation/arch/loongarch/features.rst :Translator: Huacai Chen -.. kernel-feat:: $srctree/Documentation/features loongarch +.. 
kernel-feat:: features loongarch diff --git a/Documentation/translations/zh_CN/arch/mips/features.rst b/Documentation/translations/zh_CN/arch/mips/features.rst index da1b956e4a40f..0d6df97db069b 100644 --- a/Documentation/translations/zh_CN/arch/mips/features.rst +++ b/Documentation/translations/zh_CN/arch/mips/features.rst @@ -10,4 +10,4 @@ .. _cn_features: -.. kernel-feat:: $srctree/Documentation/features mips +.. kernel-feat:: features mips diff --git a/Documentation/translations/zh_TW/arch/loongarch/features.rst b/Documentation/translations/zh_TW/arch/loongarch/features.rst index b64e430f55aef..c2175fd32b54b 100644 --- a/Documentation/translations/zh_TW/arch/loongarch/features.rst +++ b/Documentation/translations/zh_TW/arch/loongarch/features.rst @@ -5,5 +5,5 @@ :Original: Documentation/arch/loongarch/features.rst :Translator: Huacai Chen -.. kernel-feat:: $srctree/Documentation/features loongarch +.. kernel-feat:: features loongarch diff --git a/Documentation/translations/zh_TW/arch/mips/features.rst b/Documentation/translations/zh_TW/arch/mips/features.rst index f694104200354..3d3906c4d08e2 100644 --- a/Documentation/translations/zh_TW/arch/mips/features.rst +++ b/Documentation/translations/zh_TW/arch/mips/features.rst @@ -10,5 +10,5 @@ .. _tw_features: -.. kernel-feat:: $srctree/Documentation/features mips +.. kernel-feat:: features mips From 0f91df0c0fae1a88f11be3eabb1f20f73a88c90a Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Wed, 10 Jan 2024 15:01:27 +0100 Subject: [PATCH 294/304] mm/memory_hotplug: fix memmap_on_memory sysfs value retrieval commit 11684134140bb708b6e6de969a060535630b1b53 upstream. set_memmap_mode() stores the kernel parameter memmap mode as an integer. However, the get_memmap_mode() function utilizes param_get_bool() to fetch the value as a boolean, leading to potential endianness issue. On Big-endian architectures, the memmap_on_memory is consistently displayed as 'N' regardless of its actual status. To address this endianness problem, the solution involves obtaining the mode as an integer. This adjustment ensures the proper display of the memmap_on_memory parameter, presenting it as one of the following options: Force, Y, or N. Link: https://lkml.kernel.org/r/20240110140127.241451-1-sumanthk@linux.ibm.com Fixes: 2d1f649c7c08 ("mm/memory_hotplug: support memmap_on_memory when memmap is not aligned to pageblocks") Signed-off-by: Sumanth Korikkar Suggested-by: Gerald Schaefer Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Cc: [6.6+] Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/memory_hotplug.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7a5fc89a86528..c9c2ad5e2681e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -101,9 +101,11 @@ static int set_memmap_mode(const char *val, const struct kernel_param *kp) static int get_memmap_mode(char *buffer, const struct kernel_param *kp) { - if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE) - return sprintf(buffer, "force\n"); - return param_get_bool(buffer, kp); + int mode = *((int *)kp->arg); + + if (mode == MEMMAP_ON_MEMORY_FORCE) + return sprintf(buffer, "force\n"); + return sprintf(buffer, "%c\n", mode ? 
'Y' : 'N'); } static const struct kernel_param_ops memmap_mode_ops = { From a91fdae50a6d65ee57378d31284ddec7e9a7ba1b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 20 Jan 2024 11:53:02 +0100 Subject: [PATCH 295/304] Linux 6.7.1 Link: https://lore.kernel.org/r/20240118104301.249503558@linuxfoundation.org Tested-by: Ronald Warsow Tested-by: Allen Pais Tested-by: Florian Fainelli Tested-by: SeongJae Park Tested-by: Shuah Khan Tested-by: Ron Economos Tested-by: Bagas Sanjaya Tested-by: Ricardo B. Marliere Tested-by: Jon Hunter Tested-by: Luna Jernberg Tested-by: Justin M. Forbes Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c6f549f6a4aeb..186da2386a067 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 7 -SUBLEVEL = 0 +SUBLEVEL = 1 EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth From 26ae1b03568b1e8a8134dd7e58c42ef9b8a5b9e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2024 21:02:02 -1000 Subject: [PATCH 296/304] scx: Make scx_exit_info fields dynamically allocated scx_exit_info currently embeds all message buffers. This isn't great in that it makes the size of the structs a part of the ABI and wastes memory when scx is not in use. As the contents are accessed with bpf_probe_read_kernel_str(), the buffers can be moved outside the struct. This change requires the scx scheduler to be rebuilt but doesn't require code changes. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 17 ++++----- kernel/sched/ext.c | 80 +++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 29 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index ae552129931a9..f4870bd5cd073 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -18,9 +18,6 @@ struct cgroup; enum scx_consts { SCX_OPS_NAME_LEN = 128, - SCX_EXIT_REASON_LEN = 128, - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ @@ -74,14 +71,16 @@ enum scx_exit_kind { struct scx_exit_info { /* %SCX_EXIT_* - broad category of the exit reason */ enum scx_exit_kind kind; + /* textual representation of the above */ - char reason[SCX_EXIT_REASON_LEN]; - /* number of entries in the backtrace */ - u32 bt_len; + const char *reason; + /* backtrace if exiting due to an error */ - unsigned long bt[SCX_EXIT_BT_LEN]; - /* extra message */ - char msg[SCX_EXIT_MSG_LEN]; + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; }; /* sched_ext_ops.flags */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 52457f65aa572..4373dac429ea3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -18,6 +18,9 @@ enum scx_internal_consts { SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, }; enum scx_ops_enable_state { @@ -113,7 +116,7 @@ struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... 
SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -static struct scx_exit_info scx_exit_info; +static struct scx_exit_info *scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); @@ -3207,14 +3210,39 @@ static void scx_ops_bypass(bool bypass) } } +static void free_exit_info(struct scx_exit_info *ei) +{ + kfree(ei->msg); + kfree(ei->bt); + kfree(ei); +} + +static struct scx_exit_info *alloc_exit_info(void) +{ + struct scx_exit_info *ei; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return NULL; + + ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); + ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + + if (!ei->bt || !ei->msg) { + free_exit_info(ei); + return NULL; + } + + return ei; +} + static void scx_ops_disable_workfn(struct kthread_work *work) { - struct scx_exit_info *ei = &scx_exit_info; + struct scx_exit_info *ei = scx_exit_info; struct scx_task_iter sti; struct task_struct *p; struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; - const char *reason; int i, kind; kind = atomic_read(&scx_exit_kind); @@ -3229,32 +3257,30 @@ static void scx_ops_disable_workfn(struct kthread_work *work) if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) break; } + ei->kind = kind; cancel_delayed_work_sync(&scx_watchdog_work); - switch (kind) { + switch (ei->kind) { case SCX_EXIT_UNREG: - reason = "BPF scheduler unregistered"; + ei->reason = "BPF scheduler unregistered"; break; case SCX_EXIT_SYSRQ: - reason = "disabled by sysrq-S"; + ei->reason = "disabled by sysrq-S"; break; case SCX_EXIT_ERROR: - reason = "runtime error"; + ei->reason = "runtime error"; break; case SCX_EXIT_ERROR_BPF: - reason = "scx_bpf_error"; + ei->reason = "scx_bpf_error"; break; case SCX_EXIT_ERROR_STALL: - reason = "runnable task stall"; + ei->reason = "runnable task stall"; break; default: - reason = ""; + ei->reason = ""; } - ei->kind = kind; - strlcpy(ei->reason, reason, sizeof(ei->reason)); - /* guarantee forward progress by bypassing scx_ops */ scx_ops_bypass(true); @@ -3264,7 +3290,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) break; case SCX_OPS_DISABLED: pr_warn("sched_ext: ops error detected without ops (%s)\n", - scx_exit_info.msg); + scx_exit_info->msg); WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != SCX_OPS_DISABLING); goto done; @@ -3365,6 +3391,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_dsp_buf = NULL; scx_dsp_max_batch = 0; + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + mutex_unlock(&scx_ops_enable_mutex); WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != @@ -3411,17 +3440,17 @@ static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); __printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, const char *fmt, ...) 
{ - struct scx_exit_info *ei = &scx_exit_info; + struct scx_exit_info *ei = scx_exit_info; int none = SCX_EXIT_NONE; va_list args; if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) return; - ei->bt_len = stack_trace_save(ei->bt, ARRAY_SIZE(ei->bt), 1); + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); va_start(args, fmt); - vscnprintf(ei->msg, ARRAY_SIZE(ei->msg), fmt, args); + vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); va_end(args); irq_work_queue(&scx_ops_error_irq_work); @@ -3474,6 +3503,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err; } + scx_exit_info = alloc_exit_info(); + if (!scx_exit_info) { + ret = -ENOMEM; + goto err; + } + scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); if (!scx_root_kobj) { ret = -ENOMEM; @@ -3494,7 +3529,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops) WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != SCX_OPS_DISABLED); - memset(&scx_exit_info, 0, sizeof(scx_exit_info)); atomic_set(&scx_exit_kind, SCX_EXIT_NONE); scx_warned_zero_slice = false; @@ -3706,8 +3740,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops) return 0; err: - kfree(scx_root_kobj); - scx_root_kobj = NULL; + if (scx_root_kobj) { + kfree(scx_root_kobj); + scx_root_kobj = NULL; + } + if (scx_exit_info) { + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + } mutex_unlock(&scx_ops_enable_mutex); return ret; From 47ae2067390ed42978e7c2dbd6016d31abd738ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2024 21:02:02 -1000 Subject: [PATCH 297/304] scx: Misc scx_exit_info related updates - jiffies delta -> msecs conversion wasn't quite correct. Fix it. - Factor out scx_exit_kind -> reason string mapping. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 51 ++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4373dac429ea3..76dd1a097259b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -213,6 +213,14 @@ struct scx_task_iter { #define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) +static long jiffies_delta_msecs(unsigned long at, unsigned long now) +{ + if (time_after(at, now)) + return jiffies_to_msecs(at - now); + else + return -(long)jiffies_to_msecs(now - at); +} + /* if the highest set bit is N, return a mask with bits [N+1, 31] set */ static u32 higher_bits(u32 flags) { @@ -3236,6 +3244,24 @@ static struct scx_exit_info *alloc_exit_info(void) return ei; } +static const char *scx_exit_reason(enum scx_exit_kind kind) +{ + switch (kind) { + case SCX_EXIT_UNREG: + return "BPF scheduler unregistered"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: + return "runtime error"; + case SCX_EXIT_ERROR_BPF: + return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; + default: + return ""; + } +} + static void scx_ops_disable_workfn(struct kthread_work *work) { struct scx_exit_info *ei = scx_exit_info; @@ -3258,29 +3284,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) break; } ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); cancel_delayed_work_sync(&scx_watchdog_work); - switch (ei->kind) { - case SCX_EXIT_UNREG: - ei->reason = "BPF scheduler unregistered"; - break; - case SCX_EXIT_SYSRQ: - ei->reason = "disabled by sysrq-S"; - break; - case SCX_EXIT_ERROR: - ei->reason = "runtime error"; - break; - case SCX_EXIT_ERROR_BPF: - ei->reason = "scx_bpf_error"; - break; - case SCX_EXIT_ERROR_STALL: - 
ei->reason = "runnable task stall"; - break; - default: - ei->reason = ""; - } - /* guarantee forward progress by bypassing scx_ops */ scx_ops_bypass(true); @@ -4115,8 +4122,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, sizeof(runnable_at))) - scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+lldms", - (s64)(runnable_at - jiffies) * (HZ / MSEC_PER_SEC)); + scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", + jiffies_delta_msecs(runnable_at, jiffies)); /* print everything onto one line to conserve console space */ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", From 986e8e7092abff706c769341acee8922a48f2a92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2024 21:02:02 -1000 Subject: [PATCH 298/304] scx: Dump debug info after an abort When a scx scheduler gets aborted, it's difficult to tell what the system was doing after the fact as normal operation is restored by reverting to the default scheduler. Let's capture runqueue and runnable task states in a debug dump buffer to aid debugging. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 3 + kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 110 +++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index f4870bd5cd073..6d30ed942650f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -81,6 +81,9 @@ struct scx_exit_info { /* informational message */ char *msg; + + /* debug dump */ + char *dump; }; /* sched_ext_ops.flags */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 392c91667767d..e0e73b44afe98 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 76dd1a097259b..1d5551c41614a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -21,6 +21,7 @@ enum scx_internal_consts { SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_LEN = 32768, }; enum scx_ops_enable_state { @@ -3220,6 +3221,7 @@ static void scx_ops_bypass(bool bypass) static void free_exit_info(struct scx_exit_info *ei) { + kfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); @@ -3235,8 +3237,9 @@ static struct scx_exit_info *alloc_exit_info(void) ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kzalloc(SCX_EXIT_DUMP_LEN, GFP_KERNEL); - if (!ei->bt || !ei->msg) { + if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); return NULL; } @@ -3437,8 +3440,106 @@ static void scx_ops_disable(enum scx_exit_kind kind) schedule_scx_ops_disable_work(); } +static void scx_dump_task(struct seq_buf *s, struct task_struct *p, char marker, + unsigned long now) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len; + size_t avail, used; + char *buf; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + seq_buf_printf(s, "\n %c%c %-16s: pid=%d state/flags=%u/0x%x dsq_flags=0x%x\n", + marker, task_state_to_char(p), p->comm, p->pid, + scx_get_task_state(p), + p->scx.flags & ~SCX_TASK_STATE_MASK, + p->scx.dsq_flags); + seq_buf_printf(s, "%*sops_state/qseq=%lu/%lu run_at=%+ldms\n", 22, "", 
+ ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT, + jiffies_delta_msecs(p->scx.runnable_at, now)); + seq_buf_printf(s, "%*sdsq_id=%s sticky/holding_cpu=%d/%d\n", 22, "", + dsq_id_buf, p->scx.sticky_cpu, p->scx.holding_cpu); + + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); + + avail = seq_buf_get_buf(s, &buf); + used = stack_trace_snprint(buf, avail, bt, bt_len, 3); + seq_buf_commit(s, used < avail ? used : -1); +} + +static void scx_dump_state(struct scx_exit_info *ei) +{ + const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + unsigned long now = jiffies; + struct seq_buf s; + size_t avail, used; + char *buf; + int cpu; + + seq_buf_init(&s, ei->dump, SCX_EXIT_DUMP_LEN - sizeof(trunc_marker)); + + seq_buf_printf(&s, "%s[%d] triggered exit kind %d:\n %s (%s)\n\n", + current->comm, current->pid, ei->kind, ei->reason, ei->msg); + seq_buf_printf(&s, "Backtrace:\n"); + avail = seq_buf_get_buf(&s, &buf); + used = stack_trace_snprint(buf, avail, ei->bt, ei->bt_len, 1); + seq_buf_commit(&s, used < avail ? used : -1); + + seq_buf_printf(&s, "\nRunqueue states\n"); + seq_buf_printf(&s, "---------------\n"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + + rq_lock(rq, &rf); + + if (list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class) + goto next; + + seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu\n", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); + seq_buf_printf(&s, " curr=%s[%d] class=%ps\n", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + seq_buf_printf(&s, " cpus_to_kick : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + seq_buf_printf(&s, " cpus_to_preempt: %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + seq_buf_printf(&s, " cpus_to_wait : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_wait)); + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, rq->curr, '*', now); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, p, ' ', now); + next: + rq_unlock(rq, &rf); + } + + if (seq_buf_has_overflowed(&s)) { + used = strlen(seq_buf_str(&s)); + memcpy(ei->dump + used, trunc_marker, sizeof(trunc_marker)); + } +} + static void scx_ops_error_irq_workfn(struct irq_work *irq_work) { + scx_dump_state(scx_exit_info); schedule_scx_ops_disable_work(); } @@ -3460,6 +3561,13 @@ __printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); va_end(args); + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_ops_disable_workfn(). + */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + irq_work_queue(&scx_ops_error_irq_work); } From 5046ce874ad20b3230028be71fbb6632895ce4a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Jan 2024 07:59:33 -1000 Subject: [PATCH 299/304] scx: rq should be locked when calling scx_ops_exit_task() from scx_cancel_fork() scx_cancel_fork() was calling scx_ops_exit_task() without acquring rq lock. Fix it. 
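For readability, the cancel path as it ends up after the fix, condensed from the hunk below (the surrounding percpu_up_read() is unchanged):

        if (scx_enabled()) {
                struct rq_flags rf;
                struct rq *rq = task_rq_lock(p, &rf);

                WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
                scx_ops_exit_task(p);            /* now runs with p's rq lock held */
                task_rq_unlock(rq, p, &rf);
        }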
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1d5551c41614a..e45a3058d2cff 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2524,8 +2524,13 @@ void scx_post_fork(struct task_struct *p) void scx_cancel_fork(struct task_struct *p) { if (scx_enabled()) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); } percpu_up_read(&scx_fork_rwsem); } From 7df004e27bf33c1f0b16124f3114a42526170175 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:48:02 -1000 Subject: [PATCH 300/304] Revert "kernfs: convert kernfs_idr_lock to an irq safe raw spinlock" This reverts commit dad3fb67ca1cbef87ce700e83a55835e5921ce8a. The commit converted kernfs_idr_lock to an IRQ-safe raw_spinlock because it could be acquired while holding an rq lock through bpf_cgroup_from_id(). However, kernfs_idr_lock is held while doing GFP_NOWAIT allocations, which involve acquiring a non-IRQ-safe, non-raw lock, leading to the following lockdep warning: ============================= [ BUG: Invalid wait context ] 6.7.0-rc5-kzm9g-00251-g655022a45b1c #578 Not tainted ----------------------------- swapper/0/0 is trying to lock: dfbcd488 (&c->lock){....}-{3:3}, at: local_lock_acquire+0x0/0xa4 other info that might help us debug this: context-{5:5} 2 locks held by swapper/0/0: #0: dfbc9c60 (lock){+.+.}-{3:3}, at: local_lock_acquire+0x0/0xa4 #1: c0c012a8 (kernfs_idr_lock){....}-{2:2}, at: __kernfs_new_node.constprop.0+0x68/0x258 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.7.0-rc5-kzm9g-00251-g655022a45b1c #578 Hardware name: Generic SH73A0 (Flattened Device Tree) unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x90 dump_stack_lvl from __lock_acquire+0x3cc/0x168c __lock_acquire from lock_acquire+0x274/0x30c lock_acquire from local_lock_acquire+0x28/0xa4 local_lock_acquire from ___slab_alloc+0x234/0x8a8 ___slab_alloc from __slab_alloc.constprop.0+0x30/0x44 __slab_alloc.constprop.0 from kmem_cache_alloc+0x7c/0x148 kmem_cache_alloc from radix_tree_node_alloc.constprop.0+0x44/0xdc radix_tree_node_alloc.constprop.0 from idr_get_free+0x110/0x2b8 idr_get_free from idr_alloc_u32+0x9c/0x108 idr_alloc_u32 from idr_alloc_cyclic+0x50/0xb8 idr_alloc_cyclic from __kernfs_new_node.constprop.0+0x88/0x258 __kernfs_new_node.constprop.0 from kernfs_create_root+0xbc/0x154 kernfs_create_root from sysfs_init+0x18/0x5c sysfs_init from mnt_init+0xc4/0x220 mnt_init from vfs_caches_init+0x6c/0x88 vfs_caches_init from start_kernel+0x474/0x528 start_kernel from 0x0 Let's revert the commit. It's undesirable to spread out raw spinlock usage anyway, and the problem can be solved by protecting the lookup path with RCU instead.
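To make the reported nesting concrete, here is a minimal, hypothetical sketch (the example_* names are made up; only the pattern matters): a raw spinlock held across an IDR allocation, which can descend into the slab allocator and take its per-CPU local_lock, the non-raw lock lockdep flags above:

	#include <linux/idr.h>
	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(example_raw_lock);
	static DEFINE_IDR(example_idr);

	static int example_alloc_under_raw_lock(void *ptr)
	{
		unsigned long flags;
		int id;

		raw_spin_lock_irqsave(&example_raw_lock, flags);
		/*
		 * Even a non-sleeping allocation may reach kmem_cache_alloc()
		 * and take the slab allocator's per-CPU local_lock, which must
		 * not nest inside a raw spinlock.
		 */
		id = idr_alloc_cyclic(&example_idr, ptr, 1, 0, GFP_ATOMIC);
		raw_spin_unlock_irqrestore(&example_raw_lock, flags);

		return id;
	}

Under PREEMPT_RT semantics (which lockdep's wait-context checking models even on non-RT builds), spinlock_t and local_lock_t are sleeping locks while raw_spinlock_t is not, so this nesting is invalid; reverting kernfs_idr_lock to a plain spinlock_t removes the offending raw lock.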
Signed-off-by: Tejun Heo Cc: Andrea Righi Reported-by: Geert Uytterhoeven Link: http://lkml.kernel.org/r/CAMuHMdV=AKt+mwY7svEq5gFPx41LoSQZ_USME5_MEdWQze13ww@mail.gmail.com Link: https://lore.kernel.org/r/20240109214828.252092-2-tj@kernel.org Tested-by: Andrea Righi Signed-off-by: Greg Kroah-Hartman (cherry picked from commit e3977e0609a07d86406029fceea0fd40d7849368) --- fs/kernfs/dir.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 9ce7d2872b554..8b2bd65d70e72 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -27,7 +27,7 @@ static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ -static DEFINE_RAW_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -539,7 +539,6 @@ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; - unsigned long flags; if (!kn || !atomic_dec_and_test(&kn->count)) return; @@ -564,9 +563,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } - raw_spin_lock_irqsave(&kernfs_idr_lock, flags); + spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); + spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -608,7 +607,6 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *kn; u32 id_highbits; int ret; - unsigned long irqflags; name = kstrdup_const(name, GFP_KERNEL); if (!name) @@ -619,13 +617,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, goto err_out1; idr_preload(GFP_KERNEL); - raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); + spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; - raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); + spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; @@ -661,9 +659,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); + spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); + spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -704,9 +702,8 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); - unsigned long flags; - raw_spin_lock_irqsave(&kernfs_idr_lock, flags); + spin_lock(&kernfs_idr_lock); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) @@ -730,10 +727,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; - raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); + spin_unlock(&kernfs_idr_lock); return kn; err_unlock: - raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); + spin_unlock(&kernfs_idr_lock); return NULL; } From 
8ace3c7f3f4b92d6ab847f244ff133951a482c7a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:45:42 -1000 Subject: [PATCH 301/304] kernfs: Rearrange kernfs_node fields to reduce its size on 64bit Moving .flags and .mode right below .hash makes kernfs_node smaller by 8 bytes on 64bit. Signed-off-by: Tejun Heo --- include/linux/kernfs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 99aaa050ccb76..03c3fb83ab9e0 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -206,6 +206,9 @@ struct kernfs_node { const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ + unsigned short flags; + umode_t mode; + union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; @@ -220,8 +223,6 @@ struct kernfs_node { */ u64 id; - unsigned short flags; - umode_t mode; struct kernfs_iattrs *iattr; }; From 9de625bde25ce3a10fe8413e34621d084ebfbd2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:45:42 -1000 Subject: [PATCH 302/304] kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id() The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id() which acquires kernfs_idr_lock, which is an non-raw non-IRQ-safe lock. This can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF programs including e.g. the ones that attach to functions which are holding the scheduler rq lock. Consider the following BPF program: SEC("fentry/__set_cpus_allowed_ptr_locked") int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p, struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf) { struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id); if (cgrp) { bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name); bpf_cgroup_release(cgrp); } return 0; } __set_cpus_allowed_ptr_locked() is called with rq lock held and the above BPF program calls bpf_cgroup_from_id() within leading to the following lockdep warning: ===================================================== WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected 6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted ----------------------------------------------------- repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70 and this task is already holding: ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0 which would create a new lock dependency: (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2} ... Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(kernfs_idr_lock); local_irq_disable(); lock(&rq->__lock); lock(kernfs_idr_lock); lock(&rq->__lock); *** DEADLOCK *** ... Call Trace: dump_stack_lvl+0x55/0x70 dump_stack+0x10/0x20 __lock_acquire+0x781/0x2a40 lock_acquire+0xbf/0x1f0 _raw_spin_lock+0x2f/0x40 kernfs_find_and_get_node_by_id+0x1e/0x70 cgroup_get_from_id+0x21/0x240 bpf_cgroup_from_id+0xe/0x20 bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a bpf_trampoline_6442545632+0x4f/0x1000 __set_cpus_allowed_ptr_locked+0x5/0x5a0 sched_setaffinity+0x1b3/0x290 __x64_sys_sched_setaffinity+0x4f/0x60 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Let's fix it by protecting kernfs_node and kernfs_root with RCU and making kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of kernfs_idr_lock. 
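In condensed form, the reader side switches to the classic RCU-plus-refcount pattern, roughly as sketched below (the function name is hypothetical and the ino/generation checks of the real lookup are omitted for brevity):

	/* Sketch of the lookup pattern; simplified from the real function. */
	static struct kernfs_node *example_lookup(struct kernfs_root *root, u32 ino)
	{
		struct kernfs_node *kn;

		rcu_read_lock();
		kn = idr_find(&root->ino_idr, ino);
		/* Only a successful refcount bump pins a node that may be mid-removal. */
		if (kn && atomic_inc_not_zero(&kn->count)) {
			rcu_read_unlock();
			return kn;
		}
		rcu_read_unlock();
		return NULL;
	}

This is only safe because the matching free side defers the actual freeing of kernfs_node (and kernfs_root) to an RCU grace period via call_rcu()/kfree_rcu(), as done in the diff below.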
This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit. Combined with the preceding rearrange patch, the net increase is 8 bytes. Signed-off-by: Tejun Heo Cc: Andrea Righi Cc: Geert Uytterhoeven --- fs/kernfs/dir.c | 31 ++++++++++++++++++++----------- fs/kernfs/kernfs-internal.h | 2 ++ include/linux/kernfs.h | 2 ++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8b2bd65d70e72..b03bb91af24fb 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_get); +static void kernfs_free_rcu(struct rcu_head *rcu) +{ + struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); + + kfree_const(kn->name); + + if (kn->iattr) { + simple_xattrs_free(&kn->iattr->xattrs, NULL); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); + } + + kmem_cache_free(kernfs_node_cache, kn); +} + /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node @@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - kfree_const(kn->name); - - if (kn->iattr) { - simple_xattrs_free(&kn->iattr->xattrs, NULL); - kmem_cache_free(kernfs_iattrs_cache, kn->iattr); - } spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); - kmem_cache_free(kernfs_node_cache, kn); + + call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { @@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn) } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); - kfree(root); + kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); @@ -703,7 +712,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); - spin_lock(&kernfs_idr_lock); + rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) @@ -727,10 +736,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; - spin_unlock(&kernfs_idr_lock); + rcu_read_unlock(); return kn; err_unlock: - spin_unlock(&kernfs_idr_lock); + rcu_read_unlock(); return NULL; } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 237f2764b9412..b42ee6547cdc1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -49,6 +49,8 @@ struct kernfs_root { struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; + + struct rcu_head rcu; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 03c3fb83ab9e0..05dcbae7ecbf2 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -224,6 +224,8 @@ struct kernfs_node { u64 id; struct kernfs_iattrs *iattr; + + struct rcu_head rcu; }; /* From fa75b1fa5ce2e3f5cbe29416baa60e72ae46e2db Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 26 Jan 2024 22:50:40 +0100 Subject: [PATCH 303/304] scx: fix NULL pointer dereference with scx_exit_info Trying to load multiple schedulers at the same time can trigger the following NULL pointer dereference: [22053.942960] BUG: kernel NULL pointer dereference, address: 0000000000000000 [22053.942966] #PF: supervisor write access in kernel mode [22053.942968] #PF: error_code(0x0002) - not-present page [22053.942969] 
PGD 0 P4D 0 [22053.942972] Oops: 0002 [#1] PREEMPT SMP NOPTI [22053.942976] CPU: 6 PID: 3550 Comm: sched_ext_ops_h Tainted: G O 6.7.0-4-generic #4+scx3-Ubuntu [22053.942978] Hardware name: GPD G1621-02/G1621-02, BIOS 2.04 09/01/2022 [22053.942980] Sched_ext: central (enabled+all) [22053.942981] RIP: 0010:scx_ops_disable_workfn+0x85/0x610 [22053.942987] Code: 89 df f3 48 ab b9 01 00 00 00 83 fa 01 0f 86 c3 00 00 00 89 d0 f0 0f b1 0d 60 ff 11 02 0f 85 10 05 00 00 48 8b 85 90 fe ff ff <89> 10 81 fa 00 04 00 00 0f 84 ef 04 00 00 0f 8e cb 00 00 00 48 c7 [22053.942989] RSP: 0018:ffffad420257fd30 EFLAGS: 00010246 [22053.942991] RAX: 0000000000000000 RBX: ffffad420257fd58 RCX: 0000000000000001 [22053.942993] RDX: 0000000000000040 RSI: 0000000000000000 RDI: ffffad420257fd98 [22053.942994] RBP: ffffad420257fea8 R08: 0000000000000000 R09: 0000000000000000 [22053.942996] R10: 0000000000000000 R11: 0000000000000000 R12: ffffad420257fd98 [22053.942998] R13: ffff96a4d07eddc0 R14: ffff96a4d07eddc4 R15: ffffffff8897c3b0 [22053.942999] FS: 0000000000000000(0000) GS:ffff96a7dfb00000(0000) knlGS:0000000000000000 [22053.943002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [22053.943003] CR2: 0000000000000000 CR3: 00000001f6a3c001 CR4: 0000000000f70ef0 [22053.943005] PKRU: 55555554 [22053.943006] Call Trace: [22053.943008] [22053.943012] ? show_regs+0x6d/0x80 [22053.943016] ? __die+0x24/0x80 [22053.943018] ? page_fault_oops+0x99/0x1b0 [22053.943021] ? do_user_addr_fault+0x2ee/0x6b0 [22053.943024] ? exc_page_fault+0x83/0x1b0 [22053.943028] ? asm_exc_page_fault+0x27/0x30 [22053.943030] ? __pfx_scx_ops_disable_workfn+0x10/0x10 [22053.943034] ? scx_ops_disable_workfn+0x85/0x610 [22053.943036] ? asm_sysvec_irq_work+0x1b/0x20 [22053.943042] ? __pfx_scx_ops_disable_workfn+0x10/0x10 [22053.943043] kthread_worker_fn+0x9e/0x230 [22053.943048] ? __pfx_kthread_worker_fn+0x10/0x10 [22053.943050] kthread+0xef/0x120 [22053.943053] ? __pfx_kthread+0x10/0x10 [22053.943056] ret_from_fork+0x44/0x70 [22053.943058] ? __pfx_kthread+0x10/0x10 [22053.943061] ret_from_fork_asm+0x1b/0x30 [22053.943064] This happens because in scx_ops_enable(), if a scheduler is already running, we are freeing scx_exit_info, that is still owned by the running scheduler. Therefore, as soon as we stop the running scheduler we can hit the NULL pointer dereference. Reproducer: - start any scheduler - try to start another scheduler - stop the running scheduler - BUG Fix this by not freeing scx_exit_info in error path of scx_ops_enable() when there is a running scheduler. 
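Schematically, the fixed error handling in scx_ops_enable() distinguishes the two cases roughly as below (a condensed sketch, not the full function body):

		mutex_lock(&scx_ops_enable_mutex);

		/* ... */

		if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
			ret = -EBUSY;
			goto err_unlock;	/* scx_exit_info belongs to the running scheduler */
		}

		scx_exit_info = alloc_exit_info();
		if (!scx_exit_info) {
			ret = -ENOMEM;
			goto err;		/* nothing else owns it, safe to clean up */
		}

		/* ... rest of the enable sequence ... */
		mutex_unlock(&scx_ops_enable_mutex);
		return 0;

	err:
		if (scx_exit_info) {
			free_exit_info(scx_exit_info);
			scx_exit_info = NULL;
		}
	err_unlock:
		mutex_unlock(&scx_ops_enable_mutex);
		return ret;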
Fixes: 26ae1b03568b ("scx: Make scx_exit_info fields dynamically allocated") Signed-off-by: Andrea Righi --- kernel/sched/ext.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e45a3058d2cff..785da129de46b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3614,13 +3614,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops) scx_create_rt_helper("sched_ext_ops_helper")); if (!scx_ops_helper) { ret = -ENOMEM; - goto err; + goto err_unlock; } } if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ret = -EBUSY; - goto err; + goto err_unlock; } scx_exit_info = alloc_exit_info(); @@ -3868,6 +3868,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) free_exit_info(scx_exit_info); scx_exit_info = NULL; } +err_unlock: mutex_unlock(&scx_ops_enable_mutex); return ret; From e4fafc78f3f9dfa9ff61cd5b13d0f831d56228aa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 29 Jan 2024 12:33:43 -0600 Subject: [PATCH 304/304] v6.7.1-scx1 Initial release for the stable 6.7.1 release. This is being done for guix. Such releases will be done on an ad-hoc and as-requested basis. Signed-off-by: David Vernet --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cfef29ccc6ca0..e0b08032fe672 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 7 SUBLEVEL = 1 -EXTRAVERSION = +EXTRAVERSION = -scx1 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION*