Merge pull request #890 from likewhatevs/layered-dsq-timer

scx_layered: add timer antistall
sched-ext · Nov 9, 2024 · 835f0d0 · 835f0d0
2 parents 38512bf + 89f4aa1
commit 835f0d0
Show file tree

Hide file tree

Showing 4 changed files with 263 additions and 3 deletions.
diff --git a/.github/workflows/caching-build.yml b/.github/workflows/caching-build.yml
@@ -187,6 +187,7 @@ jobs:
           matrix:
             scheduler: [ scx_layered ]
             topo: ['--disable-topology=false', '--disable-topology=true']
+            antistall: ['', '--disable-antistall']
           fail-fast: false
     steps:
       # prevent cache permission errors
@@ -228,7 +229,7 @@ jobs:
         run: exit -1
 
       # The actual build:
-      - run: meson setup build -Dkernel=../linux/arch/x86/boot/bzImage -Dkernel_headers=../linux -Denable_stress=true -Dvng_rw_mount=true -Dextra_sched_args=" ${{ matrix.topo }}"
+      - run: meson setup build -Dkernel=../linux/arch/x86/boot/bzImage -Dkernel_headers=../linux -Denable_stress=true -Dvng_rw_mount=true -Dextra_sched_args=" ${{ matrix.topo }} ${{ matrix.antistall }}"
       - run: meson compile -C build ${{ matrix.scheduler }}
 
       # Print CPU model before running the tests (this can be useful for
@@ -255,7 +256,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ matrix.scheduler }}_${{ matrix.topo }}_${{ matrix.test_name }}_logs_${{ github.run_id }}_${{ github.run_attempt }}
+          name: ${{ matrix.scheduler }}_${{ matrix.topo }}_${{ matrix.test_name }}_logs_${{ github.run_id }}_${{ github.run_attempt }}_${{ matrix.antistall }}
           path: ./log_save/*.ci.log
           # it's all txt files w/ 90 day retention, lets be nice.
           compression-level: 9

diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -1,6 +1,8 @@
 /* Copyright (c) Meta Platforms, Inc. and affiliates. */
 #ifdef LSP
+#ifndef __bpf__
 #define __bpf__
+#endif
 #define LSP_INC
 #include "../../../../include/scx/common.bpf.h"
 #include "../../../../include/scx/ravg_impl.bpf.h"
@@ -39,13 +41,19 @@ const volatile s32 __sibling_cpu[MAX_CPUS];
 const volatile bool monitor_disable = false;
 const volatile unsigned char all_cpus[MAX_CPUS_U8];
 const volatile u32 layer_iteration_order[MAX_LAYERS];
+/* Flag to enable or disable antistall feature */
+const volatile bool enable_antistall = true;
+/* Delay permitted, in seconds, before antistall activates */
+const volatile u64 antistall_sec = 3;
 
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
 struct layer layers[MAX_LAYERS];
 u32 fallback_cpu;
 static u32 preempt_cursor;
 
+extern unsigned CONFIG_HZ __kconfig;
+
 #define dbg(fmt, args...)	do { if (debug) bpf_printk(fmt, ##args); } while (0)
 #define trace(fmt, args...)	do { if (debug > 1) bpf_printk(fmt, ##args); } while (0)
 
@@ -1233,6 +1241,107 @@ static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)
 	return false;
 }
 
+/* Mapping of cpu to most delayed DSQ it can consume */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, u64);
+	__uint(max_entries, 1);
+} antistall_cpu_dsq SEC(".maps");
+
+/* Mapping cpu to delay of highest delayed DSQ it can consume */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, u64);
+	__uint(max_entries, 1);
+} antistall_cpu_max_delay SEC(".maps");
+
+/**
+ * get_delay_sec() - get runnable_at delay of a task_struct in seconds.
+ * @p: task_struct *p
+ * @jiffies_now: time from which to measure delay, in jiffies.
+ *
+ * Return: runnable_at delay, if any exists, in seconds.
+ */
+int get_delay_sec(struct task_struct *p, u64 jiffies_now) 
+{
+	u64 runnable_at, delta_secs;
+	runnable_at = READ_ONCE(p->scx.runnable_at);
+
+	if (vtime_before(runnable_at, jiffies_now)) {
+		delta_secs = (jiffies_now - runnable_at) / CONFIG_HZ;
+	} else {
+		delta_secs = 0;
+	}
+
+	return delta_secs;
+}
+
+/**
+ * antistall_consume() - consume delayed DSQ
+ * @cpu: cpu number
+ * @cctx: cpu context
+ * 
+ * This function consumes a delayed DSQ. This is meant to be called
+ * from dispatch, before any other logic which could result in a 
+ * DSQ being consumed.
+ *
+ * This is meant to prevent issues such as DSQs with affinitized tasks
+ * stalling when all CPUs which can process them are continually consuming
+ * other DSQs.
+ *
+ * Return: bool indicating if a DSQ was consumed or not.
+ */
+bool antistall_consume(s32 cpu, struct cpu_ctx *cctx)
+{
+	u64 *antistall_dsq, jiffies_now, cur_delay;
+	u32 zero;
+	bool consumed;
+	struct task_struct *p;
+
+	cur_delay = 0;	
+	consumed = false;
+	zero = 0;
+
+	if (!enable_antistall)
+		return false;
+
+	if (!cctx || !cctx->layer_idx || !cpu)
+		return false;
+
+	antistall_dsq = bpf_map_lookup_elem(&antistall_cpu_dsq, &zero);
+
+	if (!antistall_dsq) {
+		scx_bpf_error("cant happen");
+		return false;
+	}
+
+	if (*antistall_dsq == SCX_DSQ_INVALID)
+		return false;
+
+	consumed = scx_bpf_consume(*antistall_dsq);
+
+	if (!consumed) 
+		goto reset;
+
+	jiffies_now = bpf_jiffies64();
+
+	bpf_for_each(scx_dsq, p, *antistall_dsq, 0) {
+		cur_delay = get_delay_sec(p, jiffies_now);
+
+		if (cur_delay > antistall_sec)
+			return consumed;
+
+		goto reset;	
+	}
+
+reset:
+	trace("antistall reset DSQ[%llu] SELECTED_CPU[%llu] DELAY[%llu]", *antistall_dsq, cpu, cur_delay);
+	*antistall_dsq = SCX_DSQ_INVALID;
+	return consumed;
+}
+
 static __noinline
 void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 {
@@ -1246,6 +1355,9 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(costc = lookup_cpu_cost(cpu)))
 		return;
 
+	if (antistall_consume(cpu, cctx))
+		return;
+
 	/*
 	 * if @prev was on SCX and is still runnable, we are here because @prev
 	 * has exhausted its slice. We may want to keep running it on this CPU
@@ -1455,6 +1567,9 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 	    !(costc = lookup_cpu_cost(cpu)))
 		return;
 
+	if (antistall_consume(cpu, cctx))
+		return;
+
 	/*
 	 * if @prev was on SCX and is still runnable, we are here because @prev
 	 * has exhausted its slice. We may want to keep running it on this CPU
@@ -2078,7 +2193,7 @@ void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p,
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
 
-	if(args->cancelled){
+	if (args->cancelled) {
 		return;
 	}
 
@@ -2258,12 +2373,133 @@ static bool layered_monitor(void)
 	return true;
 }
 
+/**
+ * antistall_set() - set antistall flags.
+ * @dsq_id: Dsq to check for delayed tasks.
+ * @jiffies_now: Time to check against in jiffies.
+ *
+ * This function sets entries in antistall_cpu_dsq used by antistall.
+ * It checks the given DSQ to see if delay exceeds antistall_sec.
+ * It tries to find a CPU satisfying the constraints of "can run the first
+ * task in the provided DSQ" and "is not already flagged for use in antistall".
+ * If it cannot find such a CPU to flag, it will try to flag a CPU flagged to
+ * process another with a lesser delay if one exists.
+ */
+u64 antistall_set(u64 dsq_id, u64 jiffies_now) 
+{
+	struct task_struct *p;
+	struct task_ctx *tctx;
+	s32 cpu;
+	u64 *antistall_dsq, *delay, cur_delay;
+	bool first_pass;
+	u32 zero;
+
+	zero = 0;
+
+	if (!dsq_id || !jiffies_now)
+		return 0;
+
+	// verifier
+	bpf_rcu_read_lock();	
+	bpf_for_each(scx_dsq, p, dsq_id, 0) {
+
+		if (!(tctx = lookup_task_ctx(p)))
+			goto unlock;
+
+		cur_delay = get_delay_sec(p, jiffies_now);
+		if (cur_delay <= antistall_sec)
+			// check head task in dsq
+			goto unlock;
+
+		first_pass = true;
+look_for_cpu:
+		bpf_for(cpu, 0, nr_possible_cpus) {
+			if (!tctx->layered_cpumask)
+				goto unlock;
+
+			if (!bpf_cpumask_test_cpu(cpu,   cast_mask(tctx->layered_cpumask)))
+				continue;
+
+			antistall_dsq = bpf_map_lookup_percpu_elem(&antistall_cpu_dsq, &zero, cpu);
+			delay = bpf_map_lookup_percpu_elem(&antistall_cpu_max_delay, &zero, cpu);
+
+			if (!antistall_dsq || !delay) {
+				scx_bpf_error("cant happen");
+				goto unlock;
+			}
+
+			if (*antistall_dsq == SCX_DSQ_INVALID) {
+				trace("antistall set DSQ[%llu] SELECTED_CPU[%llu] DELAY[%llu]", dsq_id, cpu, cur_delay);
+				*delay = cur_delay;
+				*antistall_dsq = dsq_id;
+				goto unlock;
+			}
+
+			if (first_pass)
+				continue;
+
+			if (*delay < cur_delay) {
+				trace("antistall set DSQ[%llu] SELECTED_CPU[%llu] DELAY[%llu]", dsq_id, cpu, cur_delay);
+				*delay = cur_delay;
+				*antistall_dsq = dsq_id;
+				goto unlock;
+			}
+		}		
+
+		if (first_pass) {
+			first_pass = false;
+			goto look_for_cpu;
+		}
+
+		goto unlock;
+	}
+
+unlock:
+	bpf_rcu_read_unlock();
+	return 0;
+}
+
+/**
+ * antistall_scan() - call antistall_set on all DSQs.
+ *
+ * This function calls antistall_set on all DSQs.
+ * This is where antistall figures out what work, if any, needs
+ * to be prioritized to keep runnable_at delay at or below antistall_sec.
+ */
+static bool antistall_scan(void)
+{
+	s32 cpu;
+	u64 dsq_id;
+	u64 jiffies_now;
+
+	if (!enable_antistall)
+		return true;
+
+	jiffies_now = bpf_jiffies64();
+
+	bpf_for(dsq_id, 0, nr_layers) {
+		antistall_set(dsq_id, jiffies_now);
+	}
+
+	bpf_for(cpu, 0, nr_possible_cpus) {
+		dsq_id = cpu_hi_fallback_dsq_id(cpu);
+		antistall_set(dsq_id, jiffies_now);
+	}
+
+	antistall_set(LO_FALLBACK_DSQ, jiffies_now);
+
+	antistall_set(HI_FALLBACK_DSQ_BASE, jiffies_now);
+
+	return true;
+}
 
 static bool run_timer_cb(int key)
 {
 	switch (key) {
 	case LAYERED_MONITOR:
 		return layered_monitor();
+	case ANTISTALL_TIMER:
+		return antistall_scan();
 	case NOOP_TIMER:
 	case MAX_TIMERS:
 	default:
@@ -2273,6 +2509,7 @@ static bool run_timer_cb(int key)
 
 struct layered_timer layered_timers[MAX_TIMERS] = {
 	{15LLU * NSEC_PER_SEC, CLOCK_BOOTTIME, 0},
+	{1LLU * NSEC_PER_SEC, CLOCK_BOOTTIME, 0},
 	{0LLU, CLOCK_BOOTTIME, 0},
 };
 
@@ -2285,6 +2522,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 	struct bpf_cpumask *cpumask, *tmp_big_cpumask;
 	struct cpu_ctx *cctx;
 	int i, j, k, nr_online_cpus, ret;
+	u64 *init_antistall_delay, *init_antistall_dsq;
+	u32 zero;
+
+	zero = 0;
 
 	ret = scx_bpf_create_dsq(LO_FALLBACK_DSQ, -1);
 	if (ret < 0)
@@ -2303,6 +2544,11 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 	nr_online_cpus = 0;
 	bpf_for(i, 0, nr_possible_cpus) {
 		const volatile u8 *u8_ptr;
+
+	  init_antistall_dsq = bpf_map_lookup_percpu_elem(&antistall_cpu_dsq, &zero, i);
+		if (init_antistall_dsq) {
+			*init_antistall_dsq = SCX_DSQ_INVALID;
+		}
 
 		if (!(cctx = lookup_cpu_ctx(i))) {
 			bpf_cpumask_release(cpumask);

diff --git a/scheds/rust/scx_layered/src/bpf/timer.bpf.h b/scheds/rust/scx_layered/src/bpf/timer.bpf.h
@@ -25,6 +25,7 @@ struct layered_timer {
 
 enum layer_timer_callbacks {
 	LAYERED_MONITOR,
+	ANTISTALL_TIMER,
 	NOOP_TIMER,
 	MAX_TIMERS,
 };

diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs
@@ -458,6 +458,14 @@ struct Opts {
     #[clap(long)]
     run_example: bool,
 
+    /// Disable antistall
+    #[clap(long, default_value = "false")]
+    disable_antistall: bool,
+
+    /// Maximum task runnable_at delay (in seconds) before antistall turns on
+    #[clap(long, default_value = "3")]
+    antistall_sec: u64,
+
     /// Show descriptions for statistics.
     #[clap(long)]
     help_stats: bool,
@@ -1486,9 +1494,13 @@ impl<'a> Scheduler<'a> {
         skel.maps.rodata_data.has_little_cores = topo.has_little_cores();
         skel.maps.rodata_data.disable_topology = disable_topology;
         skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
+        skel.maps.rodata_data.antistall_sec = opts.antistall_sec;
         if opts.monitor_disable {
             skel.maps.rodata_data.monitor_disable = opts.monitor_disable;
         }
+        if opts.disable_antistall {
+            skel.maps.rodata_data.enable_antistall = !opts.disable_antistall;
+        }
         for (cpu, sib) in cpu_pool.sibling_cpu.iter().enumerate() {
             skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
         }