Merge pull request #736 from multics69/scx-futex-v1
scx_lavd: split main.bpf.c into multiple files
multics69 authored Oct 5, 2024
2 parents 719e98a + 7c5c83a commit a673dcf
Showing 9 changed files with 1,934 additions and 1,867 deletions.
141 changes: 1 addition & 140 deletions scheds/rust/scx_lavd/src/bpf/intf.h
@@ -47,56 +47,11 @@ extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
/*
* common constants
*/
enum consts {
CLOCK_BOOTTIME = 7,
CACHELINE_SIZE = 64,
NSEC_PER_USEC = 1000ULL,
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
LAVD_TIME_ONE_SEC = (1000ULL * NSEC_PER_MSEC),
LAVD_TIME_INFINITY_NS = SCX_SLICE_INF,
LAVD_MAX_RETRY = 4,

LAVD_TARGETED_LATENCY_NS = (20ULL * NSEC_PER_MSEC),
LAVD_SLICE_MIN_NS = (300ULL * NSEC_PER_USEC), /* min time slice */
LAVD_SLICE_MAX_NS = (3ULL * NSEC_PER_MSEC), /* max time slice */
LAVD_SLICE_UNDECIDED = SCX_SLICE_INF,

LAVD_LC_FREQ_MAX = 1000000,
LAVD_LC_RUNTIME_MAX = LAVD_TARGETED_LATENCY_NS,
LAVD_LC_RUNTIME_SHIFT = 15,
LAVD_LC_WAKEUP_FT = 30,
LAVD_LC_KTHREAD_FT = 30,

LAVD_SLICE_BOOST_MAX_FT = 3, /* maximum additional 3x of slice */
LAVD_SLICE_BOOST_MAX_STEP = 6, /* 6 slice exhaustions in a row */
LAVD_NEW_PROC_PENALITY = 5,
LAVD_GREEDY_RATIO_NEW = (1000 * LAVD_NEW_PROC_PENALITY),

LAVD_CPU_UTIL_MAX = 1000, /* 100.0% */
LAVD_CPU_UTIL_MAX_FOR_CPUPERF = 850, /* 85.0% */
LAVD_CPU_ID_HERE = ((u32)-2),
LAVD_CPU_ID_NONE = ((u32)-1),
enum {
LAVD_CPU_ID_MAX = 512,

LAVD_PREEMPT_KICK_MARGIN = (1ULL * NSEC_PER_MSEC),
LAVD_PREEMPT_TICK_MARGIN = (100ULL * NSEC_PER_USEC),

LAVD_SYS_STAT_INTERVAL_NS = (50ULL * NSEC_PER_MSEC),
LAVD_SYS_STAT_DECAY_TIMES = (2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS,
LAVD_CC_PER_CORE_MAX_CTUIL = 500, /* maximum per-core CPU utilization */
LAVD_CC_PER_TURBO_CORE_MAX_CTUIL = 750, /* maximum per-core CPU utilization for a turbo core */
LAVD_CC_NR_ACTIVE_MIN = 1, /* minimum number of active cores */
LAVD_CC_NR_OVRFLW = 1, /* number of overflow cores */
LAVD_CC_CPU_PIN_INTERVAL = (1ULL * LAVD_TIME_ONE_SEC),
LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
LAVD_SYS_STAT_INTERVAL_NS),

LAVD_AP_HIGH_UTIL = 700, /* balanced mode when 10% < cpu util <= 40%,
performance mode when cpu util > 40% */

LAVD_CPDOM_MAX_NR = 32, /* maximum number of compute domains */
LAVD_CPDOM_MAX_DIST = 4, /* maximum distance from one compute domain to another */
LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),

LAVD_STATUS_STR_LEN = 5, /* {LR: Latency-critical, Regular}
{HI: performance-Hungry, performance-Insensitive}
@@ -139,100 +94,6 @@ struct sys_stat {
volatile u64 nr_lc_on_big; /* latency-critical tasks scheduled on big core */
};
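
A minimal sketch of how the slice bounds defined above (LAVD_SLICE_MIN_NS, LAVD_SLICE_MAX_NS) would clamp a computed time slice; the helper name is hypothetical:

static u64 clamp_time_slice(u64 slice_ns)
{
	/* keep every slice within [LAVD_SLICE_MIN_NS, LAVD_SLICE_MAX_NS] */
	if (slice_ns < LAVD_SLICE_MIN_NS)
		return LAVD_SLICE_MIN_NS;
	if (slice_ns > LAVD_SLICE_MAX_NS)
		return LAVD_SLICE_MAX_NS;
	return slice_ns;
}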

/*
* Compute domain context
* - system > numa node > llc domain > compute domain per core type (P or E)
*/
struct cpdom_ctx {
u64 id; /* id of this compute domain (== dsq_id) */
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
u64 last_consume_clk; /* when the associated DSQ was consumed */
u8 is_big; /* is it a big core or little core? */
u8 is_active; /* if this compute domain is active */
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbors per distance */
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumask of CPUs belonging to this compute domain */
} __attribute__((aligned(CACHELINE_SIZE)));
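
The per-distance neighbor tables above let a scheduler probe nearby compute domains before distant ones. A minimal sketch of that walk, with a hypothetical helper (__builtin_ctzll is supported by clang for BPF targets):

static int nearest_neighbor_id(struct cpdom_ctx *cpdomc)
{
	int d;

	/* scan distances from nearest to farthest */
	for (d = 0; d < LAVD_CPDOM_MAX_DIST; d++) {
		u64 bits = cpdomc->neighbor_bits[d];

		/* each set bit is the id of a neighboring compute domain */
		if (bits)
			return __builtin_ctzll(bits);
	}
	return -1; /* no neighbors at any distance */
}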

/*
* CPU context
*/
struct cpu_ctx {
/*
* Information used to keep track of CPU utilization
*/
volatile u64 util; /* average of the CPU utilization */
volatile u64 idle_total; /* total idle time so far */
volatile u64 idle_start_clk; /* when the CPU becomes idle */

/*
* Information used to keep track of load
*/
volatile u64 load_actual; /* actual load of runnable tasks */
volatile u64 load_run_time_ns; /* total runtime of runnable tasks */
volatile u64 tot_svc_time; /* total service time on a CPU */
volatile u64 last_kick_clk; /* when the CPU was kicked */

/*
* Information for cpu hotplug
*/
u64 online_clk; /* when a CPU becomes online */
u64 offline_clk; /* when a CPU becomes offline */

/*
* Information used to keep track of latency criticality
*/
volatile u32 max_lat_cri; /* maximum latency criticality */
volatile u32 sum_lat_cri; /* sum of latency criticality */
volatile u32 nr_sched; /* number of schedules */

/*
* Information used to keep track of performance criticality
*/
volatile u64 sum_perf_cri; /* sum of performance criticality */
volatile u64 min_perf_cri; /* minimum performance criticality */
volatile u64 max_perf_cri; /* maximum performance criticality */

/*
* Information of a current running task for preemption
*/
volatile u64 stopping_tm_est_ns; /* estimated stopping time */
volatile u16 lat_cri; /* latency criticality */
volatile u8 is_online; /* is this CPU online? */
s32 cpu_id; /* cpu id */

/*
* Information for CPU frequency scaling
*/
u32 cpuperf_cur; /* CPU's current performance target */
u32 cpuperf_task; /* task's CPU performance target */
u32 cpuperf_avg; /* EWMA of task's CPU performance target */

/*
 * Fields for core compaction
 */
u16 capacity; /* CPU capacity based on 1000 */
u8 big_core; /* is it a big core? */
u8 turbo_core; /* is it a turbo core? */
u8 cpdom_id; /* compute domain id (== dsq_id) */
u8 cpdom_alt_id; /* compute domain id of alternative type (== dsq_id) */
u8 cpdom_poll_pos; /* index to check if a DSQ of a compute domain is starving */
struct bpf_cpumask __kptr *tmp_a_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_o_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_t_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_t2_mask; /* temporary cpu mask */

/*
* Information for statistics.
*/
volatile u32 nr_migration; /* number of migrations */
volatile u32 nr_preemption; /* number of preemptions */
volatile u32 nr_greedy; /* number of greedy tasks scheduled */
volatile u32 nr_perf_cri; /* number of performance-critical tasks scheduled */
volatile u32 nr_lat_cri; /* number of latency-critical tasks scheduled */
} __attribute__((aligned(CACHELINE_SIZE)));
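
cpuperf_avg above is described as an EWMA of the task's CPU performance target. A minimal sketch of one plausible update, assuming a 3/4 decay weight (the factor actually used is not visible in this diff):

static void update_cpuperf_avg(struct cpu_ctx *cpuc, u32 cpuperf_task)
{
	/* assumed weighting: new_avg = (3 * old_avg + sample) / 4 */
	cpuc->cpuperf_avg = (3 * cpuc->cpuperf_avg + cpuperf_task) / 4;
}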

/*
* Task context
*/
116 changes: 116 additions & 0 deletions scheds/rust/scx_lavd/src/bpf/introspec.bpf.c
@@ -0,0 +1,116 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Valve Corporation.
* Author: Changwoo Min <[email protected]>
*/

/*
* To be included in main.bpf.c
*/

/*
* Introspection commands
*/
struct introspec intrspc;

struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 16 * 1024 /* 16 KB */);
} introspec_msg SEC(".maps");

static __always_inline
int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc, u32 cpu_id)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
struct cpu_ctx *cpuc;
struct msg_task_ctx *m;

cpuc = get_cpu_ctx_id(cpu_id);
if (!cpuc)
return -EINVAL;

m = bpf_ringbuf_reserve(&introspec_msg, sizeof(*m), 0);
if (!m)
return -ENOMEM;

m->hdr.kind = LAVD_MSG_TASKC;
m->taskc_x.pid = p->pid;
memcpy(m->taskc_x.comm, p->comm, TASK_COMM_LEN);
m->taskc_x.static_prio = get_nice_prio(p);
m->taskc_x.cpu_util = cpuc->util / 10;
m->taskc_x.cpu_id = cpu_id;
m->taskc_x.avg_lat_cri = stat_cur->avg_lat_cri;
m->taskc_x.thr_perf_cri = stat_cur->thr_perf_cri;
m->taskc_x.nr_active = stat_cur->nr_active;
m->taskc_x.cpuperf_cur = cpuc->cpuperf_cur;

m->taskc_x.stat[0] = is_lat_cri(taskc, stat_cur) ? 'L' : 'R';
m->taskc_x.stat[1] = is_perf_cri(taskc, stat_cur) ? 'H' : 'I';
m->taskc_x.stat[2] = cpuc->big_core ? 'B' : 'T';
m->taskc_x.stat[3] = is_greedy(taskc) ? 'G' : 'E';
m->taskc_x.stat[4] = taskc->victim_cpu >= 0 ? 'P' : 'N';
m->taskc_x.stat[5] = '\0';

memcpy(&m->taskc, taskc, sizeof(m->taskc));

bpf_ringbuf_submit(m, 0);

return 0;
}
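
Messages submitted to introspec_msg are consumed from userspace via libbpf's ring buffer API. scx_lavd's frontend is Rust, so the C sketch below only illustrates the flow; map_fd is assumed to be the fd of introspec_msg (e.g., obtained via bpf_map__fd()):

#include <stdio.h>
#include <bpf/libbpf.h>

static int handle_msg(void *ctx, void *data, size_t size)
{
	struct msg_task_ctx *m = data;

	printf("pid=%d comm=%s\n", m->taskc_x.pid, m->taskc_x.comm);
	return 0;
}

static int poll_introspec(int map_fd)
{
	struct ring_buffer *rb;

	rb = ring_buffer__new(map_fd, handle_msg, NULL, NULL);
	if (!rb)
		return -1;
	while (ring_buffer__poll(rb, 100 /* timeout, ms */) >= 0)
		;
	ring_buffer__free(rb);
	return 0;
}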

static void proc_introspec_sched_n(struct task_struct *p,
struct task_ctx *taskc, u32 cpu_id)
{
u64 cur_nr, prev_nr;
int i;

/* introspec_arg is the number of schedules remaining */
cur_nr = intrspc.arg;

/*
* Note that the bounded retry (@LAVD_MAX_RETRY) does *not* guarantee
* that introspec_arg is decremented. However, that is unlikely to
* happen, and even if it does, it merely delays a message delivery,
* since other threads will retry and eventually succeed in the CAS
* operation. So this is good enough. ;-)
*/
for (i = 0; cur_nr > 0 && i < LAVD_MAX_RETRY; i++) {
prev_nr = __sync_val_compare_and_swap(
&intrspc.arg, cur_nr, cur_nr - 1);
/* CAS success: submit a message and done */
if (prev_nr == cur_nr) {
submit_task_ctx(p, taskc, cpu_id);
break;
}
/* CAS failure: retry */
cur_nr = prev_nr;
}
}

static void proc_introspec_pid(struct task_struct *p, struct task_ctx *taskc,
u32 cpu_id)
{
if (p->pid == intrspc.arg)
submit_task_ctx(p, taskc, cpu_id);
}

static void try_proc_introspec_cmd(struct task_struct *p,
struct task_ctx *taskc, u32 cpu_id)
{
if (LAVD_CPU_ID_HERE == cpu_id)
cpu_id = bpf_get_smp_processor_id();

switch (intrspc.cmd) {
case LAVD_CMD_SCHED_N:
proc_introspec_sched_n(p, taskc, cpu_id);
break;
case LAVD_CMD_NOP:
/* do nothing */
break;
default:
scx_bpf_error("Unknown introspec command: %d", intrspc.cmd);
break;
}
}
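
Since intrspc is a global in the BPF object, userspace arms a command by writing it directly. A hypothetical snippet, assuming a generated libbpf skeleton type (lavd_bpf) that exposes the global via .bss; arg is written before cmd so the argument is visible by the time the command fires:

/* hypothetical skeleton type; the real frontend drives this from Rust */
static void request_sched_dump(struct lavd_bpf *skel)
{
	skel->bss->intrspc.arg = 10;               /* report the next 10 schedules */
	skel->bss->intrspc.cmd = LAVD_CMD_SCHED_N; /* set cmd last so arg is seen first */
}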


