From 26ae1b03568b1e8a8134dd7e58c42ef9b8a5b9e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2024 21:02:02 -1000 Subject: [PATCH 1/7] scx: Make scx_exit_info fields dynamically allocated scx_exit_info currently embeds all message buffers. This isn't great in that it makes the size of the structs a part of the ABI and wastes memory when scx is not in use. As the contents are accessed with bpf_probe_read_kernel_str(), the buffers can be moved outside the struct. This change requires the scx scheduler to be rebuilt but doesn't require code changes. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 17 ++++----- kernel/sched/ext.c | 80 +++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 29 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index ae552129931a9..f4870bd5cd073 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -18,9 +18,6 @@ struct cgroup; enum scx_consts { SCX_OPS_NAME_LEN = 128, - SCX_EXIT_REASON_LEN = 128, - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ @@ -74,14 +71,16 @@ enum scx_exit_kind { struct scx_exit_info { /* %SCX_EXIT_* - broad category of the exit reason */ enum scx_exit_kind kind; + /* textual representation of the above */ - char reason[SCX_EXIT_REASON_LEN]; - /* number of entries in the backtrace */ - u32 bt_len; + const char *reason; + /* backtrace if exiting due to an error */ - unsigned long bt[SCX_EXIT_BT_LEN]; - /* extra message */ - char msg[SCX_EXIT_MSG_LEN]; + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; }; /* sched_ext_ops.flags */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 52457f65aa572..4373dac429ea3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -18,6 +18,9 @@ enum scx_internal_consts { SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, }; enum scx_ops_enable_state { @@ -113,7 +116,7 @@ struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... 
SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -static struct scx_exit_info scx_exit_info; +static struct scx_exit_info *scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); @@ -3207,14 +3210,39 @@ static void scx_ops_bypass(bool bypass) } } +static void free_exit_info(struct scx_exit_info *ei) +{ + kfree(ei->msg); + kfree(ei->bt); + kfree(ei); +} + +static struct scx_exit_info *alloc_exit_info(void) +{ + struct scx_exit_info *ei; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return NULL; + + ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); + ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + + if (!ei->bt || !ei->msg) { + free_exit_info(ei); + return NULL; + } + + return ei; +} + static void scx_ops_disable_workfn(struct kthread_work *work) { - struct scx_exit_info *ei = &scx_exit_info; + struct scx_exit_info *ei = scx_exit_info; struct scx_task_iter sti; struct task_struct *p; struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; - const char *reason; int i, kind; kind = atomic_read(&scx_exit_kind); @@ -3229,32 +3257,30 @@ static void scx_ops_disable_workfn(struct kthread_work *work) if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) break; } + ei->kind = kind; cancel_delayed_work_sync(&scx_watchdog_work); - switch (kind) { + switch (ei->kind) { case SCX_EXIT_UNREG: - reason = "BPF scheduler unregistered"; + ei->reason = "BPF scheduler unregistered"; break; case SCX_EXIT_SYSRQ: - reason = "disabled by sysrq-S"; + ei->reason = "disabled by sysrq-S"; break; case SCX_EXIT_ERROR: - reason = "runtime error"; + ei->reason = "runtime error"; break; case SCX_EXIT_ERROR_BPF: - reason = "scx_bpf_error"; + ei->reason = "scx_bpf_error"; break; case SCX_EXIT_ERROR_STALL: - reason = "runnable task stall"; + ei->reason = "runnable task stall"; break; default: - reason = ""; + ei->reason = ""; } - ei->kind = kind; - strlcpy(ei->reason, reason, sizeof(ei->reason)); - /* guarantee forward progress by bypassing scx_ops */ scx_ops_bypass(true); @@ -3264,7 +3290,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) break; case SCX_OPS_DISABLED: pr_warn("sched_ext: ops error detected without ops (%s)\n", - scx_exit_info.msg); + scx_exit_info->msg); WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != SCX_OPS_DISABLING); goto done; @@ -3365,6 +3391,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_dsp_buf = NULL; scx_dsp_max_batch = 0; + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + mutex_unlock(&scx_ops_enable_mutex); WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != @@ -3411,17 +3440,17 @@ static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); __printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, const char *fmt, ...) 
{ - struct scx_exit_info *ei = &scx_exit_info; + struct scx_exit_info *ei = scx_exit_info; int none = SCX_EXIT_NONE; va_list args; if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) return; - ei->bt_len = stack_trace_save(ei->bt, ARRAY_SIZE(ei->bt), 1); + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); va_start(args, fmt); - vscnprintf(ei->msg, ARRAY_SIZE(ei->msg), fmt, args); + vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); va_end(args); irq_work_queue(&scx_ops_error_irq_work); @@ -3474,6 +3503,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err; } + scx_exit_info = alloc_exit_info(); + if (!scx_exit_info) { + ret = -ENOMEM; + goto err; + } + scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); if (!scx_root_kobj) { ret = -ENOMEM; @@ -3494,7 +3529,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops) WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != SCX_OPS_DISABLED); - memset(&scx_exit_info, 0, sizeof(scx_exit_info)); atomic_set(&scx_exit_kind, SCX_EXIT_NONE); scx_warned_zero_slice = false; @@ -3706,8 +3740,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops) return 0; err: - kfree(scx_root_kobj); - scx_root_kobj = NULL; + if (scx_root_kobj) { + kfree(scx_root_kobj); + scx_root_kobj = NULL; + } + if (scx_exit_info) { + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + } mutex_unlock(&scx_ops_enable_mutex); return ret; From 47ae2067390ed42978e7c2dbd6016d31abd738ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2024 21:02:02 -1000 Subject: [PATCH 2/7] scx: Misc scx_exit_info related updates - jiffies delta -> msecs conversion wasn't quite correct. Fix it. - Factor out scx_exit_kind -> reason string mapping. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 51 ++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4373dac429ea3..76dd1a097259b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -213,6 +213,14 @@ struct scx_task_iter { #define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) +static long jiffies_delta_msecs(unsigned long at, unsigned long now) +{ + if (time_after(at, now)) + return jiffies_to_msecs(at - now); + else + return -(long)jiffies_to_msecs(now - at); +} + /* if the highest set bit is N, return a mask with bits [N+1, 31] set */ static u32 higher_bits(u32 flags) { @@ -3236,6 +3244,24 @@ static struct scx_exit_info *alloc_exit_info(void) return ei; } +static const char *scx_exit_reason(enum scx_exit_kind kind) +{ + switch (kind) { + case SCX_EXIT_UNREG: + return "BPF scheduler unregistered"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: + return "runtime error"; + case SCX_EXIT_ERROR_BPF: + return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; + default: + return ""; + } +} + static void scx_ops_disable_workfn(struct kthread_work *work) { struct scx_exit_info *ei = scx_exit_info; @@ -3258,29 +3284,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) break; } ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); cancel_delayed_work_sync(&scx_watchdog_work); - switch (ei->kind) { - case SCX_EXIT_UNREG: - ei->reason = "BPF scheduler unregistered"; - break; - case SCX_EXIT_SYSRQ: - ei->reason = "disabled by sysrq-S"; - break; - case SCX_EXIT_ERROR: - ei->reason = "runtime error"; - break; - case SCX_EXIT_ERROR_BPF: - ei->reason = "scx_bpf_error"; - break; - case SCX_EXIT_ERROR_STALL: - 
ei->reason = "runnable task stall"; - break; - default: - ei->reason = ""; - } - /* guarantee forward progress by bypassing scx_ops */ scx_ops_bypass(true); @@ -4115,8 +4122,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, sizeof(runnable_at))) - scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+lldms", - (s64)(runnable_at - jiffies) * (HZ / MSEC_PER_SEC)); + scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", + jiffies_delta_msecs(runnable_at, jiffies)); /* print everything onto one line to conserve console space */ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", From 986e8e7092abff706c769341acee8922a48f2a92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2024 21:02:02 -1000 Subject: [PATCH 3/7] scx: Dump debug info after an abort When a scx scheduler gets aborted, it's difficult to tell what the system was doing after the fact as normal operation is restored by reverting to the default scheduler. Let's capture runqueue and runnable task states in a debug dump buffer to aid debugging. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 3 + kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 110 +++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index f4870bd5cd073..6d30ed942650f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -81,6 +81,9 @@ struct scx_exit_info { /* informational message */ char *msg; + + /* debug dump */ + char *dump; }; /* sched_ext_ops.flags */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 392c91667767d..e0e73b44afe98 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 76dd1a097259b..1d5551c41614a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -21,6 +21,7 @@ enum scx_internal_consts { SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_LEN = 32768, }; enum scx_ops_enable_state { @@ -3220,6 +3221,7 @@ static void scx_ops_bypass(bool bypass) static void free_exit_info(struct scx_exit_info *ei) { + kfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); @@ -3235,8 +3237,9 @@ static struct scx_exit_info *alloc_exit_info(void) ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kzalloc(SCX_EXIT_DUMP_LEN, GFP_KERNEL); - if (!ei->bt || !ei->msg) { + if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); return NULL; } @@ -3437,8 +3440,106 @@ static void scx_ops_disable(enum scx_exit_kind kind) schedule_scx_ops_disable_work(); } +static void scx_dump_task(struct seq_buf *s, struct task_struct *p, char marker, + unsigned long now) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len; + size_t avail, used; + char *buf; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + seq_buf_printf(s, "\n %c%c %-16s: pid=%d state/flags=%u/0x%x dsq_flags=0x%x\n", + marker, task_state_to_char(p), p->comm, p->pid, + scx_get_task_state(p), + p->scx.flags & ~SCX_TASK_STATE_MASK, + p->scx.dsq_flags); + seq_buf_printf(s, "%*sops_state/qseq=%lu/%lu run_at=%+ldms\n", 22, "", + 
ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT, + jiffies_delta_msecs(p->scx.runnable_at, now)); + seq_buf_printf(s, "%*sdsq_id=%s sticky/holding_cpu=%d/%d\n", 22, "", + dsq_id_buf, p->scx.sticky_cpu, p->scx.holding_cpu); + + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); + + avail = seq_buf_get_buf(s, &buf); + used = stack_trace_snprint(buf, avail, bt, bt_len, 3); + seq_buf_commit(s, used < avail ? used : -1); +} + +static void scx_dump_state(struct scx_exit_info *ei) +{ + const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + unsigned long now = jiffies; + struct seq_buf s; + size_t avail, used; + char *buf; + int cpu; + + seq_buf_init(&s, ei->dump, SCX_EXIT_DUMP_LEN - sizeof(trunc_marker)); + + seq_buf_printf(&s, "%s[%d] triggered exit kind %d:\n %s (%s)\n\n", + current->comm, current->pid, ei->kind, ei->reason, ei->msg); + seq_buf_printf(&s, "Backtrace:\n"); + avail = seq_buf_get_buf(&s, &buf); + used = stack_trace_snprint(buf, avail, ei->bt, ei->bt_len, 1); + seq_buf_commit(&s, used < avail ? used : -1); + + seq_buf_printf(&s, "\nRunqueue states\n"); + seq_buf_printf(&s, "---------------\n"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + + rq_lock(rq, &rf); + + if (list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class) + goto next; + + seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu\n", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); + seq_buf_printf(&s, " curr=%s[%d] class=%ps\n", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + seq_buf_printf(&s, " cpus_to_kick : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + seq_buf_printf(&s, " cpus_to_preempt: %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + seq_buf_printf(&s, " cpus_to_wait : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_wait)); + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, rq->curr, '*', now); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, p, ' ', now); + next: + rq_unlock(rq, &rf); + } + + if (seq_buf_has_overflowed(&s)) { + used = strlen(seq_buf_str(&s)); + memcpy(ei->dump + used, trunc_marker, sizeof(trunc_marker)); + } +} + static void scx_ops_error_irq_workfn(struct irq_work *irq_work) { + scx_dump_state(scx_exit_info); schedule_scx_ops_disable_work(); } @@ -3460,6 +3561,13 @@ __printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); va_end(args); + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_ops_disable_workfn(). + */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + irq_work_queue(&scx_ops_error_irq_work); } From 5046ce874ad20b3230028be71fbb6632895ce4a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Jan 2024 07:59:33 -1000 Subject: [PATCH 4/7] scx: rq should be locked when calling scx_ops_exit_task() from scx_cancel_fork() scx_cancel_fork() was calling scx_ops_exit_task() without acquiring the rq lock. Fix it.
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1d5551c41614a..e45a3058d2cff 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2524,8 +2524,13 @@ void scx_post_fork(struct task_struct *p) void scx_cancel_fork(struct task_struct *p) { if (scx_enabled()) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); } percpu_up_read(&scx_fork_rwsem); } From 7df004e27bf33c1f0b16124f3114a42526170175 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:48:02 -1000 Subject: [PATCH 5/7] Revert "kernfs: convert kernfs_idr_lock to an irq safe raw spinlock" This reverts commit dad3fb67ca1cbef87ce700e83a55835e5921ce8a. The commit converted kernfs_idr_lock to an IRQ-safe raw_spinlock because it could be acquired while holding an rq lock through bpf_cgroup_from_id(). However, kernfs_idr_lock is held while doing GFP_NOWAIT allocations, which involves acquiring a non-IRQ-safe, non-raw lock, leading to the following lockdep warning: ============================= [ BUG: Invalid wait context ] 6.7.0-rc5-kzm9g-00251-g655022a45b1c #578 Not tainted ----------------------------- swapper/0/0 is trying to lock: dfbcd488 (&c->lock){....}-{3:3}, at: local_lock_acquire+0x0/0xa4 other info that might help us debug this: context-{5:5} 2 locks held by swapper/0/0: #0: dfbc9c60 (lock){+.+.}-{3:3}, at: local_lock_acquire+0x0/0xa4 #1: c0c012a8 (kernfs_idr_lock){....}-{2:2}, at: __kernfs_new_node.constprop.0+0x68/0x258 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.7.0-rc5-kzm9g-00251-g655022a45b1c #578 Hardware name: Generic SH73A0 (Flattened Device Tree) unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x90 dump_stack_lvl from __lock_acquire+0x3cc/0x168c __lock_acquire from lock_acquire+0x274/0x30c lock_acquire from local_lock_acquire+0x28/0xa4 local_lock_acquire from ___slab_alloc+0x234/0x8a8 ___slab_alloc from __slab_alloc.constprop.0+0x30/0x44 __slab_alloc.constprop.0 from kmem_cache_alloc+0x7c/0x148 kmem_cache_alloc from radix_tree_node_alloc.constprop.0+0x44/0xdc radix_tree_node_alloc.constprop.0 from idr_get_free+0x110/0x2b8 idr_get_free from idr_alloc_u32+0x9c/0x108 idr_alloc_u32 from idr_alloc_cyclic+0x50/0xb8 idr_alloc_cyclic from __kernfs_new_node.constprop.0+0x88/0x258 __kernfs_new_node.constprop.0 from kernfs_create_root+0xbc/0x154 kernfs_create_root from sysfs_init+0x18/0x5c sysfs_init from mnt_init+0xc4/0x220 mnt_init from vfs_caches_init+0x6c/0x88 vfs_caches_init from start_kernel+0x474/0x528 start_kernel from 0x0 Let's revert the commit. It's undesirable to spread out raw spinlock usage anyway, and the problem can be solved by protecting the lookup path with RCU instead.
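As a rough sketch of that direction (hypothetical names, not the kernfs code; PATCH 7/7 below does the actual conversion for kernfs_find_and_get_node_by_id()), an RCU-protected id -> object lookup drops the spinlock on the read side and only takes a reference if the object is still live:

	/*
	 * Minimal sketch of an RCU-protected lookup, assuming the object is
	 * freed via call_rcu() and the IDR's internal nodes are already
	 * RCU-safe. All names here are hypothetical.
	 */
	#include <linux/idr.h>
	#include <linux/rcupdate.h>
	#include <linux/atomic.h>

	struct foo {
		atomic_t	count;
		struct rcu_head	rcu;
	};

	static struct foo *foo_find_and_get(struct idr *idr, unsigned long id)
	{
		struct foo *f;

		rcu_read_lock();		/* no spinlock on the read side */
		f = idr_find(idr, id);
		/* @f may already be on its way out; only succeed if still referenced */
		if (f && !atomic_inc_not_zero(&f->count))
			f = NULL;
		rcu_read_unlock();
		return f;
	}

The freeing side then reclaims the object only after a grace period (call_rcu()/kfree_rcu()), which is what the last patch in this series does for kernfs_node and kernfs_root.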
Signed-off-by: Tejun Heo Cc: Andrea Righi Reported-by: Geert Uytterhoeven Link: http://lkml.kernel.org/r/CAMuHMdV=AKt+mwY7svEq5gFPx41LoSQZ_USME5_MEdWQze13ww@mail.gmail.com Link: https://lore.kernel.org/r/20240109214828.252092-2-tj@kernel.org Tested-by: Andrea Righi Signed-off-by: Greg Kroah-Hartman (cherry picked from commit e3977e0609a07d86406029fceea0fd40d7849368) --- fs/kernfs/dir.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 9ce7d2872b554..8b2bd65d70e72 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -27,7 +27,7 @@ static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ -static DEFINE_RAW_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -539,7 +539,6 @@ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; - unsigned long flags; if (!kn || !atomic_dec_and_test(&kn->count)) return; @@ -564,9 +563,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } - raw_spin_lock_irqsave(&kernfs_idr_lock, flags); + spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); + spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -608,7 +607,6 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *kn; u32 id_highbits; int ret; - unsigned long irqflags; name = kstrdup_const(name, GFP_KERNEL); if (!name) @@ -619,13 +617,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, goto err_out1; idr_preload(GFP_KERNEL); - raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); + spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; - raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); + spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; @@ -661,9 +659,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); + spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); + spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -704,9 +702,8 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); - unsigned long flags; - raw_spin_lock_irqsave(&kernfs_idr_lock, flags); + spin_lock(&kernfs_idr_lock); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) @@ -730,10 +727,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; - raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); + spin_unlock(&kernfs_idr_lock); return kn; err_unlock: - raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); + spin_unlock(&kernfs_idr_lock); return NULL; } From 
8ace3c7f3f4b92d6ab847f244ff133951a482c7a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:45:42 -1000 Subject: [PATCH 6/7] kernfs: Rearrange kernfs_node fields to reduce its size on 64bit Moving .flags and .mode right below .hash makes kernfs_node smaller by 8 bytes on 64bit. Signed-off-by: Tejun Heo --- include/linux/kernfs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 99aaa050ccb76..03c3fb83ab9e0 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -206,6 +206,9 @@ struct kernfs_node { const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ + unsigned short flags; + umode_t mode; + union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; @@ -220,8 +223,6 @@ struct kernfs_node { */ u64 id; - unsigned short flags; - umode_t mode; struct kernfs_iattrs *iattr; }; From 9de625bde25ce3a10fe8413e34621d084ebfbd2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:45:42 -1000 Subject: [PATCH 7/7] kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id() The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id() which acquires kernfs_idr_lock, which is a non-raw, non-IRQ-safe lock. This can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF program, including e.g. ones that attach to functions which run with the scheduler rq lock held. Consider the following BPF program: SEC("fentry/__set_cpus_allowed_ptr_locked") int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p, struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf) { struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id); if (cgrp) { bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name); bpf_cgroup_release(cgrp); } return 0; } __set_cpus_allowed_ptr_locked() is called with the rq lock held, and the above BPF program calls bpf_cgroup_from_id() from within it, leading to the following lockdep warning: ===================================================== WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected 6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted ----------------------------------------------------- repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70 and this task is already holding: ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0 which would create a new lock dependency: (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2} ... Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(kernfs_idr_lock); local_irq_disable(); lock(&rq->__lock); lock(kernfs_idr_lock); lock(&rq->__lock); *** DEADLOCK *** ... Call Trace: dump_stack_lvl+0x55/0x70 dump_stack+0x10/0x20 __lock_acquire+0x781/0x2a40 lock_acquire+0xbf/0x1f0 _raw_spin_lock+0x2f/0x40 kernfs_find_and_get_node_by_id+0x1e/0x70 cgroup_get_from_id+0x21/0x240 bpf_cgroup_from_id+0xe/0x20 bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a bpf_trampoline_6442545632+0x4f/0x1000 __set_cpus_allowed_ptr_locked+0x5/0x5a0 sched_setaffinity+0x1b3/0x290 __x64_sys_sched_setaffinity+0x4f/0x60 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Let's fix it by protecting kernfs_node and kernfs_root with RCU and making kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of kernfs_idr_lock.
This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit. Combined with the preceding rearrange patch, the net increase is 8 bytes. Signed-off-by: Tejun Heo Cc: Andrea Righi Cc: Geert Uytterhoeven --- fs/kernfs/dir.c | 31 ++++++++++++++++++++----------- fs/kernfs/kernfs-internal.h | 2 ++ include/linux/kernfs.h | 2 ++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8b2bd65d70e72..b03bb91af24fb 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_get); +static void kernfs_free_rcu(struct rcu_head *rcu) +{ + struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); + + kfree_const(kn->name); + + if (kn->iattr) { + simple_xattrs_free(&kn->iattr->xattrs, NULL); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); + } + + kmem_cache_free(kernfs_node_cache, kn); +} + /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node @@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - kfree_const(kn->name); - - if (kn->iattr) { - simple_xattrs_free(&kn->iattr->xattrs, NULL); - kmem_cache_free(kernfs_iattrs_cache, kn->iattr); - } spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); - kmem_cache_free(kernfs_node_cache, kn); + + call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { @@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn) } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); - kfree(root); + kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); @@ -703,7 +712,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); - spin_lock(&kernfs_idr_lock); + rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) @@ -727,10 +736,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; - spin_unlock(&kernfs_idr_lock); + rcu_read_unlock(); return kn; err_unlock: - spin_unlock(&kernfs_idr_lock); + rcu_read_unlock(); return NULL; } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 237f2764b9412..b42ee6547cdc1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -49,6 +49,8 @@ struct kernfs_root { struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; + + struct rcu_head rcu; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 03c3fb83ab9e0..05dcbae7ecbf2 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -224,6 +224,8 @@ struct kernfs_node { u64 id; struct kernfs_iattrs *iattr; + + struct rcu_head rcu; }; /*