Commit

Merge branch 'sched-ext:main' into main
hodgesds authored Sep 12, 2024
2 parents d35b596 + 632fcfe commit a5ab930
Showing 5 changed files with 193 additions and 7 deletions.
4 changes: 4 additions & 0 deletions rust/scx_utils/src/topology.rs
@@ -166,6 +166,8 @@ impl Cpu {
#[derive(Debug, Clone)]
pub struct Core {
id: usize,
pub node_id: usize,
pub llc_id: usize,
cpus: BTreeMap<usize, Cpu>,
span: Cpumask,
pub core_type: CoreType,
@@ -525,6 +527,8 @@ fn create_insert_cpu(

let core = cache.cores.entry(core_id).or_insert(Core {
id: core_id,
llc_id: llc_id,
node_id: node.id,
cpus: BTreeMap::new(),
span: Cpumask::new()?,
core_type: core_type.clone(),
4 changes: 4 additions & 0 deletions scheds/rust/scx_layered/src/bpf/intf.h
@@ -71,6 +71,8 @@ enum layer_stat_idx {
LSTAT_YIELD,
LSTAT_YIELD_IGNORE,
LSTAT_MIGRATION,
LSTAT_XNUMA_MIGRATION,
LSTAT_XLLC_MIGRATION,
NR_LSTATS,
};

@@ -86,6 +88,8 @@ struct cpu_ctx {
u64 lstats[MAX_LAYERS][NR_LSTATS];
u64 ran_current_for;
u32 layer_idx;
u32 node_idx;
u32 cache_idx;
};

struct cache_ctx {
100 changes: 95 additions & 5 deletions scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -136,9 +136,6 @@ static u32 cpu_to_llc_id(s32 cpu_id)
return *llc_ptr;
}

/*
* Numa node context
*/
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
@@ -147,6 +144,30 @@ struct {
__uint(map_flags, 0);
} node_data SEC(".maps");

static struct node_ctx *lookup_node_ctx(u32 node)
{
struct node_ctx *nodec;

nodec = bpf_map_lookup_elem(&node_data, &node);
return nodec;
}

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct cache_ctx);
__uint(max_entries, MAX_DOMS);
__uint(map_flags, 0);
} cache_data SEC(".maps");

static struct cache_ctx *lookup_cache_ctx(u32 cache_idx)
{
struct cache_ctx *cachec;

cachec = bpf_map_lookup_elem(&cache_data, &cache_idx);
return cachec;
}

static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx)
{
if (idx < 0 || idx >= NR_GSTATS) {
@@ -1166,6 +1187,7 @@ static s32 create_node(u32 node_id)
u32 cpu;
struct bpf_cpumask *cpumask;
struct node_ctx *nodec;
struct cpu_ctx *cctx;
s32 ret;

nodec = bpf_map_lookup_elem(&node_data, &node_id);
@@ -1198,8 +1220,58 @@
break;
}

if (*nmask & (1LLU << (cpu % 64)))
if (*nmask & (1LLU << (cpu % 64))) {
bpf_cpumask_set_cpu(cpu, cpumask);
if (!(cctx = lookup_cpu_ctx(-1))) {
scx_bpf_error("cpu ctx error");
ret = -ENOENT;
break;
}
cctx->node_idx = node_id;
}
}

bpf_rcu_read_unlock();
return ret;
}

static s32 create_cache(u32 cache_id)
{
u32 cpu, llc_id;
struct bpf_cpumask *cpumask;
struct cache_ctx *cachec;
struct cpu_ctx *cctx;
s32 ret;

cachec = bpf_map_lookup_elem(&cache_data, &cache_id);
if (!cachec) {
scx_bpf_error("No cache%u", cache_id);
return -ENOENT;
}
cachec->id = cache_id;

ret = create_save_cpumask(&cachec->cpumask);
if (ret)
return ret;

bpf_rcu_read_lock();
cpumask = cachec->cpumask;
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup node cpumask");
return -ENOENT;
}

bpf_for(cpu, 0, MAX_CPUS) {
llc_id = cpu_to_llc_id(cpu);
if (llc_id != cache_id)
continue;

bpf_cpumask_set_cpu(cpu, cpumask);
if (!(cctx = lookup_cpu_ctx(-1))) {
scx_bpf_error("cpu ctx error"); ret = -ENOENT; break;
}
cctx->cache_idx = cache_id;
}

bpf_rcu_read_unlock();
Expand All @@ -1225,14 +1297,27 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
struct cpu_ctx *cctx;
struct task_ctx *tctx;
struct layer *layer;
struct node_ctx *nodec;
struct cache_ctx *cachec;
s32 task_cpu = scx_bpf_task_cpu(p);

if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
!(layer = lookup_layer(tctx->layer)))
return;

if (tctx->last_cpu >= 0 && tctx->last_cpu != task_cpu)
if (tctx->last_cpu >= 0 && tctx->last_cpu != task_cpu) {
lstat_inc(LSTAT_MIGRATION, layer, cctx);
if (!(nodec = lookup_node_ctx(cctx->node_idx)))
return;
if (nodec->cpumask &&
!bpf_cpumask_test_cpu(tctx->last_cpu, nodec->cpumask))
lstat_inc(LSTAT_XNUMA_MIGRATION, layer, cctx);
if (!(cachec = lookup_cache_ctx(cctx->cache_idx)))
return;
if (cachec->cpumask &&
!bpf_cpumask_test_cpu(tctx->last_cpu, cachec->cpumask))
lstat_inc(LSTAT_XLLC_MIGRATION, layer, cctx);
}
tctx->last_cpu = task_cpu;

if (vtime_before(layer->vtime_now, p->scx.dsq_vtime))
@@ -1560,6 +1645,11 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
if (ret)
return ret;
}
bpf_for(i, 0, nr_llcs) {
ret = create_cache(i);
if (ret)
return ret;
}

dbg("CFG: Dumping configuration, nr_online_cpus=%d smt_enabled=%d",
nr_online_cpus, smt_enabled);
82 changes: 81 additions & 1 deletion scheds/rust/scx_layered/src/main.rs
@@ -268,6 +268,11 @@ lazy_static::lazy_static! {
///
/// - slice_us: Scheduling slice duration in microseconds.
///
/// - growth_algo: When a layer is allocated new CPUs, different algorithms can
/// be used to determine which CPU should be allocated next. The default is a
/// "sticky" algorithm that attempts to spread layers evenly across cores.
///
/// - perf: CPU performance target. 0 means no configuration. A value
/// between 1 and 1024 indicates the performance level that CPUs running
/// tasks in this layer are configured to via scx_bpf_cpuperf_set().
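
For context, a hedged sketch (not part of this commit) of how a per-layer perf target can be applied on the BPF side. It assumes scx_layered's usual BPF includes (vmlinux.h plus the scx common helpers) are in scope, and example_perf is a made-up illustrative value, not a field from this patch.

/*
 * Hedged sketch, not scx_layered's implementation: push a perf target
 * (1-1024; 0 means leave the CPU's performance level alone) to the CPU a
 * task starts running on.
 */
static const u32 example_perf = 512;	/* illustrative only */

void BPF_STRUCT_OPS(example_running, struct task_struct *p)
{
	s32 cpu = scx_bpf_task_cpu(p);

	if (example_perf)
		scx_bpf_cpuperf_set(cpu, example_perf);
}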
@@ -450,9 +455,18 @@ enum LayerMatch {
#[derive(Clone, Debug, Parser, Serialize, Deserialize)]
#[clap(rename_all = "snake_case")]
enum LayerGrowthAlgo {
/// Sticky attempts to place layers evenly spaced across cores.
Sticky,
/// Linear starts with the lowest-numbered CPU and grows towards the total
/// number of CPUs.
Linear,
/// Random core selection order.
Random,
/// Topo uses the order of the nodes/llcs in the layer config to determine
/// the order in which CPUs are selected when growing a layer. It starts
/// from the llcs configuration and then falls back to the NUMA node
/// configuration for any CPUs not covered.
Topo,
}
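
A standalone sketch (not part of this commit; the core/LLC/node IDs are made up) of the ordering Topo aims for: cores on the layer's preferred LLCs come first, then cores on its preferred NUMA nodes, with duplicates skipped, mirroring the layer_core_order() changes further down.

// Simplified sketch, not scx_layered code: derive a Topo-style core growth
// order from per-core (llc_id, node_id) pairs and a layer's preferred
// llcs/nodes. LLC preferences are honored first, then node preferences.
fn topo_core_order(
    cores: &[(usize, usize)], // (llc_id, node_id), indexed by core id
    spec_llcs: &[usize],
    spec_nodes: &[usize],
) -> Vec<usize> {
    let mut order = Vec::new();
    for llc in spec_llcs {
        for (core_id, (llc_id, _)) in cores.iter().enumerate() {
            if llc_id == llc && !order.contains(&core_id) {
                order.push(core_id);
            }
        }
    }
    for node in spec_nodes {
        for (core_id, (_, node_id)) in cores.iter().enumerate() {
            if node_id == node && !order.contains(&core_id) {
                order.push(core_id);
            }
        }
    }
    order
}

fn main() {
    // Two cores per LLC; LLCs 0-1 sit on node 0 and LLC 2 on node 1.
    let cores = [(0, 0), (0, 0), (1, 0), (1, 0), (2, 1), (2, 1)];
    // A layer that prefers LLC 1 first and then anything on node 1.
    assert_eq!(topo_core_order(&cores, &[1], &[1]), vec![2, 3, 4, 5]);
}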

impl Default for LayerGrowthAlgo {
@@ -559,6 +573,20 @@ impl LayerSpec {
};
Ok(config.specs)
}
fn nodes(&self) -> Vec<usize> {
match &self.kind {
LayerKind::Confined { nodes, .. }
| LayerKind::Open { nodes, .. }
| LayerKind::Grouped { nodes, .. } => nodes.clone(),
}
}
fn llcs(&self) -> Vec<usize> {
match &self.kind {
LayerKind::Confined { llcs, .. }
| LayerKind::Open { llcs, .. }
| LayerKind::Grouped { llcs, .. } => llcs.clone(),
}
}
}

#[derive(Clone, Debug, Serialize, Deserialize)]
@@ -1093,6 +1121,7 @@ impl CpuPool {
}

fn layer_core_order(
cpu_pool: &CpuPool,
spec: &LayerSpec,
growth_algo: LayerGrowthAlgo,
layer_idx: usize,
@@ -1141,6 +1170,48 @@ fn layer_core_order(
fastrand::seed(layer_idx.try_into().unwrap());
fastrand::shuffle(&mut core_order);
}
LayerGrowthAlgo::Topo => {
let spec_nodes = spec.nodes();
let spec_llcs = spec.llcs();
let topo_nodes = topo.nodes();

if spec_nodes.len() + spec_llcs.len() == 0 {
// XXX: fallback to something more sane (round robin when it exists)
linear();
} else {
let mut core_id = 0;
spec_llcs.iter().for_each(|spec_llc| {
core_id = 0;
topo_nodes.iter().for_each(|topo_node| {
topo_node.cores().values().for_each(|core| {
if core.llc_id != *spec_llc {
core_id += 1;
return;
}
if !core_order.contains(&core_id) {
core_order.push(core_id);
}
core_id += 1;
});
});
});
spec_nodes.iter().for_each(|spec_node| {
core_id = 0;
topo_nodes.iter().for_each(|topo_node| {
if topo_node.id() != *spec_node {
core_id += topo_node.cores().len();
return;
}
topo_node.cores().values().for_each(|_core| {
if !core_order.contains(&core_id) {
core_order.push(core_id);
}
core_id += 1;
});
});
});
}
}
}
core_order
}
@@ -1237,7 +1308,7 @@ impl Layer {
| LayerKind::Open { growth_algo, .. } => growth_algo.clone(),
};

let core_order = layer_core_order(spec, layer_growth_algo.clone(), idx, topo);
let core_order = layer_core_order(cpu_pool, spec, layer_growth_algo.clone(), idx, topo);
debug!(
"layer: {} algo: {:?} core order: {:?}",
name,
@@ -1614,6 +1685,15 @@ impl<'a, 'b> Scheduler<'a, 'b> {
node.llcs().len()
);
skel.maps.rodata_data.nr_llcs += node.llcs().len() as u32;
let raw_numa_slice = node.span().as_raw_slice();
let node_cpumask_slice = &mut skel.maps.rodata_data.numa_cpumasks[node.id()];
let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
left.clone_from_slice(raw_numa_slice);
debug!(
"node {} mask: {:?}",
node.id(),
skel.maps.rodata_data.numa_cpumasks[node.id()]
);

for (_, llc) in node.llcs() {
debug!("configuring llc {:?} for node {:?}", llc.id(), node.id());
10 changes: 9 additions & 1 deletion scheds/rust/scx_layered/src/stats.rs
@@ -103,6 +103,10 @@ pub struct LayerStats {
pub yield_ignore: u64,
#[stat(desc = "% migrated across CPUs")]
pub migration: f64,
#[stat(desc = "% migrated across NUMA nodes")]
pub xnuma_migration: f64,
#[stat(desc = "% migrated across LLCs")]
pub xllc_migration: f64,
#[stat(desc = "mask of allocated CPUs", _om_skip)]
pub cpus: Vec<u32>,
#[stat(desc = "# of CPUs assigned")]
@@ -188,6 +192,8 @@ impl LayerStats {
yielded: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_YIELD),
yield_ignore: lstat(bpf_intf::layer_stat_idx_LSTAT_YIELD_IGNORE) as u64,
migration: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_MIGRATION),
xnuma_migration: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_XNUMA_MIGRATION),
xllc_migration: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_XLLC_MIGRATION),
cpus: Self::bitvec_to_u32s(&layer.cpus),
cur_nr_cpus: layer.cpus.count_ones() as u32,
min_nr_cpus: nr_cpus_range.0 as u32,
@@ -235,10 +241,12 @@

writeln!(
w,
" {:<width$} open_idle={} mig={} affn_viol={}",
" {:<width$} open_idle={} mig={} xnuma_mig={} xllc_mig={} affn_viol={}",
"",
fmt_pct(self.open_idle),
fmt_pct(self.migration),
fmt_pct(self.xnuma_migration),
fmt_pct(self.xllc_migration),
fmt_pct(self.affn_viol),
width = header_width,
)?;
