Merge pull request #736 from multics69/scx-futex-v1
scx_lavd: split main.bpf.c into multiple files
multics69 authored Oct 5, 2024
2 parents 719e98a + 7c5c83a commit a673dcf
Showing 9 changed files with 1,934 additions and 1,867 deletions.
141 changes: 1 addition & 140 deletions scheds/rust/scx_lavd/src/bpf/intf.h
@@ -47,56 +47,11 @@ extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
/*
* common constants
*/
enum consts {
CLOCK_BOOTTIME = 7,
CACHELINE_SIZE = 64,
NSEC_PER_USEC = 1000ULL,
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
LAVD_TIME_ONE_SEC = (1000ULL * NSEC_PER_MSEC),
LAVD_TIME_INFINITY_NS = SCX_SLICE_INF,
LAVD_MAX_RETRY = 4,

LAVD_TARGETED_LATENCY_NS = (20ULL * NSEC_PER_MSEC),
LAVD_SLICE_MIN_NS = (300ULL * NSEC_PER_USEC), /* min time slice */
LAVD_SLICE_MAX_NS = (3ULL * NSEC_PER_MSEC), /* max time slice */
LAVD_SLICE_UNDECIDED = SCX_SLICE_INF,

LAVD_LC_FREQ_MAX = 1000000,
LAVD_LC_RUNTIME_MAX = LAVD_TARGETED_LATENCY_NS,
LAVD_LC_RUNTIME_SHIFT = 15,
LAVD_LC_WAKEUP_FT = 30,
LAVD_LC_KTHREAD_FT = 30,

LAVD_SLICE_BOOST_MAX_FT = 3, /* maximum additional 3x of slice */
LAVD_SLICE_BOOST_MAX_STEP = 6, /* 6 slice exhaustions in a row */
LAVD_NEW_PROC_PENALITY = 5,
LAVD_GREEDY_RATIO_NEW = (1000 * LAVD_NEW_PROC_PENALITY),

LAVD_CPU_UTIL_MAX = 1000, /* 100.0% */
LAVD_CPU_UTIL_MAX_FOR_CPUPERF = 850, /* 85.0% */
LAVD_CPU_ID_HERE = ((u32)-2),
LAVD_CPU_ID_NONE = ((u32)-1),
enum {
LAVD_CPU_ID_MAX = 512,

LAVD_PREEMPT_KICK_MARGIN = (1ULL * NSEC_PER_MSEC),
LAVD_PREEMPT_TICK_MARGIN = (100ULL * NSEC_PER_USEC),

LAVD_SYS_STAT_INTERVAL_NS = (50ULL * NSEC_PER_MSEC),
LAVD_SYS_STAT_DECAY_TIMES = (2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS,
LAVD_CC_PER_CORE_MAX_CTUIL = 500, /* maximum per-core CPU utilization */
LAVD_CC_PER_TURBO_CORE_MAX_CTUIL = 750, /* maximum per-core CPU utilization for a turbo core */
LAVD_CC_NR_ACTIVE_MIN = 1, /* minimum number of active cores */
LAVD_CC_NR_OVRFLW = 1, /* number of overflow cores */
LAVD_CC_CPU_PIN_INTERVAL = (1ULL * LAVD_TIME_ONE_SEC),
LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
LAVD_SYS_STAT_INTERVAL_NS),

LAVD_AP_HIGH_UTIL = 700, /* balanced mode when 10% < cpu util <= 40%,
performance mode when cpu util > 40% */

LAVD_CPDOM_MAX_NR = 32, /* maximum number of compute domains */
LAVD_CPDOM_MAX_DIST = 4, /* maximum distance from one compute domain to another */
LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),

LAVD_STATUS_STR_LEN = 5, /* {LR: Latency-critical, Regular}
{HI: performance-Hungry, performance-Insensitive}
@@ -139,100 +94,6 @@ struct sys_stat {
volatile u64 nr_lc_on_big; /* latency-critical tasks scheduled on big core */
};
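
A minimal sketch of how the slice bounds defined above (LAVD_SLICE_MIN_NS, LAVD_SLICE_MAX_NS) would clamp a computed time slice; the helper name is hypothetical:

static u64 clamp_time_slice(u64 slice_ns)
{
	/* keep every slice within [LAVD_SLICE_MIN_NS, LAVD_SLICE_MAX_NS] */
	if (slice_ns < LAVD_SLICE_MIN_NS)
		return LAVD_SLICE_MIN_NS;
	if (slice_ns > LAVD_SLICE_MAX_NS)
		return LAVD_SLICE_MAX_NS;
	return slice_ns;
}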

/*
* Compute domain context
* - system > numa node > llc domain > compute domain per core type (P or E)
*/
struct cpdom_ctx {
u64 id; /* id of this compute domain (== dsq_id) */
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
u64 last_consume_clk; /* when the associated DSQ was consumed */
u8 is_big; /* is it a big core or little core? */
u8 is_active; /* if this compute domain is active */
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbors per distance */
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumask of CPUs belonging to this compute domain */
} __attribute__((aligned(CACHELINE_SIZE)));
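
The per-distance neighbor tables above let a scheduler probe nearby compute domains before distant ones. A minimal sketch of that walk, with a hypothetical helper (__builtin_ctzll is supported by clang for BPF targets):

static int nearest_neighbor_id(struct cpdom_ctx *cpdomc)
{
	int d;

	/* scan distances from nearest to farthest */
	for (d = 0; d < LAVD_CPDOM_MAX_DIST; d++) {
		u64 bits = cpdomc->neighbor_bits[d];

		/* each set bit is the id of a neighboring compute domain */
		if (bits)
			return __builtin_ctzll(bits);
	}
	return -1; /* no neighbors at any distance */
}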

/*
* CPU context
*/
struct cpu_ctx {
/*
* Information used to keep track of CPU utilization
*/
volatile u64 util; /* average of the CPU utilization */
volatile u64 idle_total; /* total idle time so far */
volatile u64 idle_start_clk; /* when the CPU becomes idle */

/*
* Information used to keep track of load
*/
volatile u64 load_actual; /* actual load of runnable tasks */
volatile u64 load_run_time_ns; /* total runtime of runnable tasks */
volatile u64 tot_svc_time; /* total service time on a CPU */
volatile u64 last_kick_clk; /* when the CPU was kicked */

/*
* Information for cpu hotplug
*/
u64 online_clk; /* when a CPU becomes online */
u64 offline_clk; /* when a CPU becomes offline */

/*
* Information used to keep track of latency criticality
*/
volatile u32 max_lat_cri; /* maximum latency criticality */
volatile u32 sum_lat_cri; /* sum of latency criticality */
volatile u32 nr_sched; /* number of schedules */

/*
* Information used to keep track of performance criticality
*/
volatile u64 sum_perf_cri; /* sum of performance criticality */
volatile u64 min_perf_cri; /* minimum performance criticality */
volatile u64 max_perf_cri; /* maximum performance criticality */

/*
* Information of a current running task for preemption
*/
volatile u64 stopping_tm_est_ns; /* estimated stopping time */
volatile u16 lat_cri; /* latency criticality */
volatile u8 is_online; /* is this CPU online? */
s32 cpu_id; /* cpu id */

/*
* Information for CPU frequency scaling
*/
u32 cpuperf_cur; /* CPU's current performance target */
u32 cpuperf_task; /* task's CPU performance target */
u32 cpuperf_avg; /* EWMA of task's CPU performance target */

/*
 * Fields for core compaction
 */
u16 capacity; /* CPU capacity based on 1000 */
u8 big_core; /* is it a big core? */
u8 turbo_core; /* is it a turbo core? */
u8 cpdom_id; /* compute domain id (== dsq_id) */
u8 cpdom_alt_id; /* compute domain id of alternative type (== dsq_id) */
u8 cpdom_poll_pos; /* index to check if a DSQ of a compute domain is starving */
struct bpf_cpumask __kptr *tmp_a_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_o_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_t_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_t2_mask; /* temporary cpu mask */

/*
* Information for statistics.
*/
volatile u32 nr_migration; /* number of migrations */
volatile u32 nr_preemption; /* number of preemptions */
volatile u32 nr_greedy; /* number of greedy tasks scheduled */
volatile u32 nr_perf_cri; /* number of performance-critical tasks scheduled */
volatile u32 nr_lat_cri; /* number of latency-critical tasks scheduled */
} __attribute__((aligned(CACHELINE_SIZE)));
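
cpuperf_avg above is described as an EWMA of the task's CPU performance target. A minimal sketch of one plausible update, assuming a 3/4 decay weight (the factor actually used is not visible in this diff):

static void update_cpuperf_avg(struct cpu_ctx *cpuc, u32 cpuperf_task)
{
	/* assumed weighting: new_avg = (3 * old_avg + sample) / 4 */
	cpuc->cpuperf_avg = (3 * cpuc->cpuperf_avg + cpuperf_task) / 4;
}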

/*
* Task context
*/
116 changes: 116 additions & 0 deletions scheds/rust/scx_lavd/src/bpf/introspec.bpf.c
@@ -0,0 +1,116 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Valve Corporation.
* Author: Changwoo Min <[email protected]>
*/

/*
* To be included in main.bpf.c
*/

/*
* Introspection commands
*/
struct introspec intrspc;

struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 16 * 1024 /* 16 KB */);
} introspec_msg SEC(".maps");

static __always_inline
int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc, u32 cpu_id)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
struct cpu_ctx *cpuc;
struct msg_task_ctx *m;

cpuc = get_cpu_ctx_id(cpu_id);
if (!cpuc)
return -EINVAL;

m = bpf_ringbuf_reserve(&introspec_msg, sizeof(*m), 0);
if (!m)
return -ENOMEM;

m->hdr.kind = LAVD_MSG_TASKC;
m->taskc_x.pid = p->pid;
memcpy(m->taskc_x.comm, p->comm, TASK_COMM_LEN);
m->taskc_x.static_prio = get_nice_prio(p);
m->taskc_x.cpu_util = cpuc->util / 10;
m->taskc_x.cpu_id = cpu_id;
m->taskc_x.avg_lat_cri = stat_cur->avg_lat_cri;
m->taskc_x.thr_perf_cri = stat_cur->thr_perf_cri;
m->taskc_x.nr_active = stat_cur->nr_active;
m->taskc_x.cpuperf_cur = cpuc->cpuperf_cur;

m->taskc_x.stat[0] = is_lat_cri(taskc, stat_cur) ? 'L' : 'R';
m->taskc_x.stat[1] = is_perf_cri(taskc, stat_cur) ? 'H' : 'I';
m->taskc_x.stat[2] = cpuc->big_core ? 'B' : 'T';
m->taskc_x.stat[3] = is_greedy(taskc) ? 'G' : 'E';
m->taskc_x.stat[4] = taskc->victim_cpu >= 0 ? 'P' : 'N';
m->taskc_x.stat[5] = '\0';

memcpy(&m->taskc, taskc, sizeof(m->taskc));

bpf_ringbuf_submit(m, 0);

return 0;
}
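
Messages submitted to introspec_msg are consumed from userspace via libbpf's ring buffer API. scx_lavd's frontend is Rust, so the C sketch below only illustrates the flow; map_fd is assumed to be the fd of introspec_msg (e.g., obtained via bpf_map__fd()):

#include <stdio.h>
#include <bpf/libbpf.h>

static int handle_msg(void *ctx, void *data, size_t size)
{
	struct msg_task_ctx *m = data;

	printf("pid=%d comm=%s\n", m->taskc_x.pid, m->taskc_x.comm);
	return 0;
}

static int poll_introspec(int map_fd)
{
	struct ring_buffer *rb;

	rb = ring_buffer__new(map_fd, handle_msg, NULL, NULL);
	if (!rb)
		return -1;
	while (ring_buffer__poll(rb, 100 /* timeout, ms */) >= 0)
		;
	ring_buffer__free(rb);
	return 0;
}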

static void proc_introspec_sched_n(struct task_struct *p,
struct task_ctx *taskc, u32 cpu_id)
{
u64 cur_nr, prev_nr;
int i;

/* introspec_arg is the number of schedules remaining */
cur_nr = intrspc.arg;

/*
* Note that the bounded retry (@LAVD_MAX_RETRY) does *not* guarantee
* that introspec_arg is decremented. However, that is unlikely to
* happen, and even if it does, it merely delays a message delivery,
* since other threads will retry and eventually succeed in the CAS
* operation. So this is good enough. ;-)
*/
for (i = 0; cur_nr > 0 && i < LAVD_MAX_RETRY; i++) {
prev_nr = __sync_val_compare_and_swap(
&intrspc.arg, cur_nr, cur_nr - 1);
/* CAS success: submit a message and done */
if (prev_nr == cur_nr) {
submit_task_ctx(p, taskc, cpu_id);
break;
}
/* CAS failure: retry */
cur_nr = prev_nr;
}
}

static void proc_introspec_pid(struct task_struct *p, struct task_ctx *taskc,
u32 cpu_id)
{
if (p->pid == intrspc.arg)
submit_task_ctx(p, taskc, cpu_id);
}

static void try_proc_introspec_cmd(struct task_struct *p,
struct task_ctx *taskc, u32 cpu_id)
{
if (LAVD_CPU_ID_HERE == cpu_id)
cpu_id = bpf_get_smp_processor_id();

switch (intrspc.cmd) {
case LAVD_CMD_SCHED_N:
proc_introspec_sched_n(p, taskc, cpu_id);
break;
case LAVD_CMD_NOP:
/* do nothing */
break;
default:
scx_bpf_error("Unknown introspec command: %d", intrspc.cmd);
break;
}
}
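
Since intrspc is a global in the BPF object, userspace arms a command by writing it directly. A hypothetical snippet, assuming a generated libbpf skeleton type (lavd_bpf) that exposes the global via .bss; arg is written before cmd so the argument is visible by the time the command fires:

/* hypothetical skeleton type; the real frontend drives this from Rust */
static void request_sched_dump(struct lavd_bpf *skel)
{
	skel->bss->intrspc.arg = 10;               /* report the next 10 schedules */
	skel->bss->intrspc.cmd = LAVD_CMD_SCHED_N; /* set cmd last so arg is seen first */
}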


