diff --git a/.github/workflows/caching-build.yml b/.github/workflows/caching-build.yml index e94352954..68867c42f 100644 --- a/.github/workflows/caching-build.yml +++ b/.github/workflows/caching-build.yml @@ -13,7 +13,7 @@ jobs: - run: sudo apt install -y git --no-install-recommends # get latest head commit of sched_ext for-next - - run: echo "SCHED_EXT_KERNEL_COMMIT=KERNEL_COMMIT_SHA7_HERE" >> $GITHUB_ENV + - run: echo "SCHED_EXT_KERNEL_COMMIT=21f4c19" >> $GITHUB_ENV - uses: actions/checkout@v4 # use cached kernel if available, create after job if not @@ -93,7 +93,7 @@ jobs: run: cargo install virtiofsd && sudo cp -a ~/.cargo/bin/virtiofsd /usr/lib/ # get latest head commit of sched_ext for-next - - run: echo "SCHED_EXT_KERNEL_COMMIT=KERNEL_COMMIT_SHA7_HERE" >> $GITHUB_ENV + - run: echo "SCHED_EXT_KERNEL_COMMIT=21f4c19" >> $GITHUB_ENV # use cached kernel if available, create after job if not - name: Cache Kernel @@ -154,7 +154,7 @@ jobs: run: cargo install virtiofsd && sudo cp -a ~/.cargo/bin/virtiofsd /usr/lib/ # get latest head commit of sched_ext for-next - - run: echo "SCHED_EXT_KERNEL_COMMIT=KERNEL_COMMIT_SHA7_HERE" >> $GITHUB_ENV + - run: echo "SCHED_EXT_KERNEL_COMMIT=21f4c19" >> $GITHUB_ENV # cache bzImage alone for rust tests - name: Cache bzImage id: cache-bzImage @@ -200,7 +200,7 @@ jobs: run: cargo install virtiofsd && sudo cp -a ~/.cargo/bin/virtiofsd /usr/lib/ # get latest head commit of sched_ext for-next - - run: echo "SCHED_EXT_KERNEL_COMMIT=KERNEL_COMMIT_SHA7_HERE" >> $GITHUB_ENV + - run: echo "SCHED_EXT_KERNEL_COMMIT=21f4c19" >> $GITHUB_ENV # cache bzImage alone for rust tests - name: Cache bzImage id: cache-bzImage diff --git a/rust/scx_utils/src/bpf_builder.rs b/rust/scx_utils/src/bpf_builder.rs index 6830ce236..7e8a13c9d 100644 --- a/rust/scx_utils/src/bpf_builder.rs +++ b/rust/scx_utils/src/bpf_builder.rs @@ -362,7 +362,7 @@ impl BpfBuilder { } /// Return `(VER, SHA1)` from which the bulit-in `vmlinux.h` is generated. - pub fn vmlinux_h_ver_sha1() -> (String, String) { + pub fn vmlinux_h_ver_sha1() -> String { let mut ar = tar::Archive::new(Self::BPF_H_TAR); for file in ar.entries().unwrap() { @@ -378,7 +378,7 @@ impl BpfBuilder { .to_string_lossy() .to_string(); - return sscanf!(name, "vmlinux-v{String}-g{String}.h").unwrap(); + return sscanf!(name, "vmlinux-{String}.h").unwrap(); } panic!("vmlinux/vmlinux.h not found"); @@ -586,15 +586,10 @@ mod tests { #[test] fn test_vmlinux_h_ver_sha1() { - let (ver, sha1) = super::BpfBuilder::vmlinux_h_ver_sha1(); + let ver = super::BpfBuilder::vmlinux_h_ver_sha1(); - println!("vmlinux.h: ver={:?} sha1={:?}", &ver, &sha1,); + println!("vmlinux.h: ver={:?}", &ver); - assert!(regex::Regex::new(r"^([1-9][0-9]*\.[1-9][0-9][a-z0-9-]*)$") - .unwrap() - .is_match(&ver)); - assert!(regex::Regex::new(r"^[0-9a-z]{12}$") - .unwrap() - .is_match(&sha1)); + assert!(regex::Regex::new(r"^[a-f0-9]{7}$").unwrap().is_match(&ver)); } } diff --git a/rust/scx_utils/src/compat.rs b/rust/scx_utils/src/compat.rs index c2d977b0e..977457d02 100644 --- a/rust/scx_utils/src/compat.rs +++ b/rust/scx_utils/src/compat.rs @@ -164,9 +164,9 @@ macro_rules! unwrap_or_break { pub fn check_min_requirements() -> Result<()> { // ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext // is the current minimum required kernel version. 
- if let Ok(false) | Err(_) = struct_has_field("sched_ext_ops", "dump") { - bail!("sched_ext_ops.dump() missing, kernel too old?"); - } + // if let Ok(false) | Err(_) = struct_has_field("sched_ext_ops", "dump") { + // bail!("sched_ext_ops.dump() missing, kernel too old?"); + // } Ok(()) } @@ -187,21 +187,21 @@ macro_rules! scx_ops_open { }; let ops = skel.struct_ops.[<$ops _mut>](); - let path = std::path::Path::new("/sys/kernel/sched_ext/hotplug_seq"); - - let val = match std::fs::read_to_string(&path) { - Ok(val) => val, - Err(_) => { - break 'block Err(anyhow::anyhow!("Failed to open or read file {:?}", path)); - } - }; - - ops.hotplug_seq = match val.trim().parse::() { - Ok(parsed) => parsed, - Err(_) => { - break 'block Err(anyhow::anyhow!("Failed to parse hotplug seq {}", val)); - } - }; + // let path = std::path::Path::new("/sys/kernel/sched_ext/hotplug_seq"); + + // let val = match std::fs::read_to_string(&path) { + // Ok(val) => val, + // Err(_) => { + // break 'block Err(anyhow::anyhow!("Failed to open or read file {:?}", path)); + // } + // }; + + // ops.hotplug_seq = match val.trim().parse::() { + // Ok(parsed) => parsed, + // Err(_) => { + // break 'block Err(anyhow::anyhow!("Failed to parse hotplug seq {}", val)); + // } + // }; let result : Result, anyhow::Error> = Ok(skel); result @@ -218,7 +218,7 @@ macro_rules! scx_ops_open { macro_rules! scx_ops_load { ($skel: expr, $ops: ident, $uei: ident) => { 'block: { scx_utils::paste! { - scx_utils::uei_set_size!($skel, $ops, $uei); + //scx_utils::uei_set_size!($skel, $ops, $uei); $skel.load().context("Failed to load BPF program") } }}; diff --git a/rust/scx_utils/src/lib.rs b/rust/scx_utils/src/lib.rs index e5272d3e6..2e3444bcc 100644 --- a/rust/scx_utils/src/lib.rs +++ b/rust/scx_utils/src/lib.rs @@ -41,14 +41,14 @@ pub use bpf_builder::BpfBuilder; mod builder; pub use builder::Builder; -mod user_exit_info; -pub use user_exit_info::ScxConsts; -pub use user_exit_info::ScxExitKind; -pub use user_exit_info::UeiDumpPtr; -pub use user_exit_info::UserExitInfo; -pub use user_exit_info::SCX_ECODE_ACT_RESTART; -pub use user_exit_info::SCX_ECODE_RSN_HOTPLUG; -pub use user_exit_info::UEI_DUMP_PTR_MUTEX; +// mod user_exit_info; +// // pub use user_exit_info::ScxConsts; +// // pub use user_exit_info::ScxExitKind; +// pub use user_exit_info::UeiDumpPtr; +// pub use user_exit_info::UserExitInfo; +// pub use user_exit_info::SCX_ECODE_ACT_RESTART; +// pub use user_exit_info::SCX_ECODE_RSN_HOTPLUG; +// pub use user_exit_info::UEI_DUMP_PTR_MUTEX; pub mod build_id; pub mod compat; diff --git a/scheds/include/scx/compat.bpf.h b/scheds/include/scx/compat.bpf.h index 3d2fe1208..5fb4dffb3 100644 --- a/scheds/include/scx/compat.bpf.h +++ b/scheds/include/scx/compat.bpf.h @@ -15,6 +15,97 @@ __ret; \ }) +/* + * %SCX_KICK_IDLE is a later addition. To support both before and after, use + * %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it. + * Users can use %SCX_KICK_IDLE directly in the future. + */ +#define __COMPAT_SCX_KICK_IDLE \ + __COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE) + +/* + * scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See + * %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. This can be dropped in the + * future. + */ +void scx_bpf_switch_all(void) __ksym __weak; + +static inline void __COMPAT_scx_bpf_switch_all(void) +{ + scx_bpf_switch_all(); +} + +/* + * scx_bpf_exit() is a new addition. Fall back to scx_bpf_error() if + * unavailable. Users can use scx_bpf_exit() directly in the future. 
+ */ +#define __COMPAT_scx_bpf_exit(code, fmt, args...) \ +({ \ + if (bpf_ksym_exists(scx_bpf_exit_bstr)) \ + scx_bpf_exit((code), fmt, ##args); \ + else \ + scx_bpf_error(fmt, ##args); \ +}) + +/* + * scx_bpf_dump() is a new addition. Ignore if unavailable. Users can use + * scx_bpf_dump() directly in the future. + */ +#define __COMPAT_scx_bpf_dump(fmt, args...) \ +({ \ + if (bpf_ksym_exists(scx_bpf_dump_bstr)) \ + scx_bpf_dump(fmt, ##args); \ +}) + +/* + * scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. No good + * way to noop these kfuncs. Provide a test macro. Users can assume existence in + * the future. + */ +#define __COMPAT_HAS_CPUMASKS \ + bpf_ksym_exists(scx_bpf_nr_cpu_ids) + +/* + * cpuperf is new. The followings become noop on older kernels. Callers can be + * updated to call cpuperf kfuncs directly in the future. + */ +static inline u32 __COMPAT_scx_bpf_cpuperf_cap(s32 cpu) +{ + if (bpf_ksym_exists(scx_bpf_cpuperf_cap)) + return scx_bpf_cpuperf_cap(cpu); + else + return 1024; +} + +static inline u32 __COMPAT_scx_bpf_cpuperf_cur(s32 cpu) +{ + if (bpf_ksym_exists(scx_bpf_cpuperf_cur)) + return scx_bpf_cpuperf_cur(cpu); + else + return 1024; +} + +static inline void __COMPAT_scx_bpf_cpuperf_set(s32 cpu, u32 perf) +{ + if (bpf_ksym_exists(scx_bpf_cpuperf_set)) + return scx_bpf_cpuperf_set(cpu, perf); +} + +/* + * Iteration and scx_bpf_consume_task() are new. The following become noop on + * older kernels. The users can switch to bpf_for_each(scx_dsq) and directly + * call scx_bpf_consume_task() in the future. + */ +#define __COMPAT_DSQ_FOR_EACH(p, dsq_id, flags) \ + if (bpf_ksym_exists(bpf_iter_scx_dsq_new)) \ + bpf_for_each(scx_dsq, (p), (dsq_id), (flags)) + +static inline bool __COMPAT_scx_bpf_consume_task(struct bpf_iter_scx_dsq *it, + struct task_struct *p) +{ + return false; +} + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). 
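Note (illustration only, not part of the patch): the __COMPAT_* wrappers above are meant to be called from scheduler BPF code in place of the raw kfuncs, so the same object loads on kernels both with and without the newer symbols. A minimal usage sketch, assuming the usual scx includes (vmlinux.h, scx/common.bpf.h); the helper name and the DSQ id are hypothetical:

static void example_compat_usage(s32 cpu)
{
	struct task_struct *p;

	/* The kick flag degrades to 0 on kernels that lack SCX_KICK_IDLE. */
	scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);

	/* The cpuperf wrappers report 1024 / become no-ops on older kernels. */
	if (__COMPAT_scx_bpf_cpuperf_cur(cpu) < __COMPAT_scx_bpf_cpuperf_cap(cpu))
		__COMPAT_scx_bpf_cpuperf_set(cpu, __COMPAT_scx_bpf_cpuperf_cap(cpu));

	/* Compiles out entirely where DSQ iteration isn't available. */
	__COMPAT_DSQ_FOR_EACH(p, 0 /* hypothetical dsq id */, 0)
		bpf_printk("queued pid=%d", p->pid);
}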
@@ -26,3 +117,4 @@ }; #endif /* __SCX_COMPAT_BPF_H */ + diff --git a/scheds/include/scx/compat.h b/scheds/include/scx/compat.h index cc56ff9aa..70021b999 100644 --- a/scheds/include/scx/compat.h +++ b/scheds/include/scx/compat.h @@ -143,8 +143,8 @@ static inline long scx_hotplug_seq(void) #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ \ - SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ - "sched_ext_ops.dump() missing, kernel too old?"); \ + /* SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), */ \ + /* "sched_ext_ops.dump() missing, kernel too old?"); */ \ \ __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ diff --git a/scheds/include/scx/user_exit_info.h b/scheds/include/scx/user_exit_info.h index 891693ee6..8bdf3620f 100644 --- a/scheds/include/scx/user_exit_info.h +++ b/scheds/include/scx/user_exit_info.h @@ -18,7 +18,7 @@ enum uei_sizes { struct user_exit_info { int kind; - s64 exit_code; + // s64 exit_code; char reason[UEI_REASON_LEN]; char msg[UEI_MSG_LEN]; }; @@ -28,6 +28,15 @@ struct user_exit_info { #include "vmlinux.h" #include +static inline void uei_record(struct user_exit_info *uei, + const struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); + bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); + /* use __sync to force memory barrier */ + __sync_val_compare_and_swap(&uei->kind, uei->kind, ei->type); +} + #define UEI_DEFINE(__name) \ char RESIZABLE_ARRAY(data, __name##_dump); \ const volatile u32 __name##_dump_len; \ @@ -38,13 +47,13 @@ struct user_exit_info { sizeof(__uei_name.reason), (__ei)->reason); \ bpf_probe_read_kernel_str(__uei_name.msg, \ sizeof(__uei_name.msg), (__ei)->msg); \ - bpf_probe_read_kernel_str(__uei_name##_dump, \ - __uei_name##_dump_len, (__ei)->dump); \ - if (bpf_core_field_exists((__ei)->exit_code)) \ - __uei_name.exit_code = (__ei)->exit_code; \ + /* bpf_probe_read_kernel_str(__uei_name##_dump, */ \ + /* __uei_name##_dump_len, (__ei)->dump); */ \ + /* if (bpf_core_field_exists((__ei)->exit_code)) */ \ + /* __uei_name.exit_code = (__ei)->exit_code; */ \ /* use __sync to force memory barrier */ \ - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ - (__ei)->kind); \ + __sync_val_compare_and_swap(&__uei_name.type, __uei_name.type, \ + (__ei)->type); \ }) #else /* !__bpf__ */ @@ -53,11 +62,11 @@ struct user_exit_info { #include /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ -#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ - u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ - (__skel)->rodata->__uei_name##_dump_len = __len; \ - RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ -}) +// #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ +// u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ +// (__skel)->rodata->__uei_name##_dump_len = __len; \ +// RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ +// }) #define UEI_EXITED(__skel, __uei_name) ({ \ /* use __sync to force memory barrier */ \ @@ -66,18 +75,18 @@ struct user_exit_info { #define UEI_REPORT(__skel, __uei_name) ({ \ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ - char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ - if (__uei_dump[0] != '\0') { \ - fputs("\nDEBUG DUMP\n", stderr); \ - 
fputs("================================================================================\n\n", stderr); \ - fputs(__uei_dump, stderr); \ - fputs("\n================================================================================\n\n", stderr); \ - } \ + /* char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; *\ \ + /* if (__uei_dump[0] != '\0') { *\ \ + /* fputs("\nDEBUG DUMP\n", stderr); *\ \ + /* fputs("================================================================================\n\n", stderr); *\ \ + /* fputs(__uei_dump, stderr); *\ \ + /* fputs("\n================================================================================\n\n", stderr); *\ \ + /* } *\ \ fprintf(stderr, "EXIT: %s", __uei->reason); \ if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ fputs("\n", stderr); \ - __uei->exit_code; \ + /* __uei->exit_code; */ \ }) /* diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c index d1ad9bca9..e1603d573 100644 --- a/scheds/rust/scx_layered/src/bpf/main.bpf.c +++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c @@ -35,9 +35,14 @@ static u32 preempt_cursor; #define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0) #define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) -#include "util.bpf.c" +#include "util.bpf.h" -UEI_DEFINE(uei); +#define __COMPAT_scx_bpf_error(fmt, args...) \ + do { \ + bpf_printk(fmt, ##args); \ + } while (0) + +struct user_exit_info uei; static inline bool vtime_before(u64 a, u64 b) { @@ -87,7 +92,7 @@ static __noinline u32 iter_layer_cpu_ctx(u32 layer_idx, int idx) offset -= nr_layers; if (offset > MAX_LAYERS) { - scx_bpf_error("invalid layer id %u", layer_idx); + __COMPAT_scx_bpf_error("invalid layer id %u", layer_idx); return 0; } return offset; @@ -117,7 +122,7 @@ static struct cpu_ctx *lookup_cpu_ctx(int cpu) cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); if (!cctx) { - scx_bpf_error("no cpu_ctx for cpu %d", cpu); + __COMPAT_scx_bpf_error("no cpu_ctx for cpu %d", cpu); return NULL; } @@ -130,7 +135,7 @@ static u32 cpu_to_llc_id(s32 cpu_id) llc_ptr = MEMBER_VPTR(cpu_llc_id_map, [cpu_id]); if (!llc_ptr) { - scx_bpf_error("Couldn't look up llc ID for cpu %d", cpu_id); + __COMPAT_scx_bpf_error("Couldn't look up llc ID for cpu %d", cpu_id); return 0; } return *llc_ptr; @@ -150,7 +155,7 @@ struct { static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx) { if (idx < 0 || idx >= NR_GSTATS) { - scx_bpf_error("invalid global stat idx %d", idx); + __COMPAT_scx_bpf_error("invalid global stat idx %d", idx); return; } @@ -165,7 +170,7 @@ static void lstat_add(enum layer_stat_idx idx, struct layer *layer, if ((vptr = MEMBER_VPTR(*cctx, .lstats[layer->idx][idx]))) (*vptr) += delta; else - scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx); + __COMPAT_scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx); } static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, @@ -195,7 +200,7 @@ static void adj_load(u32 layer_idx, s64 adj, u64 now) lockw = bpf_map_lookup_elem(&layer_load_locks, &layer_idx); if (!layer || !lockw) { - scx_bpf_error("Can't access layer%d or its load_lock", layer_idx); + __COMPAT_scx_bpf_error("Can't access layer%d or its load_lock", layer_idx); return; } @@ -205,7 +210,7 @@ static void adj_load(u32 layer_idx, s64 adj, u64 now) bpf_spin_unlock(&lockw->lock); if (debug && adj < 0 && (s64)layer->load < 0) - scx_bpf_error("cpu%d layer%d load underflow 
(load=%lld adj=%lld)", + __COMPAT_scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)", bpf_get_smp_processor_id(), layer_idx, layer->load, adj); } @@ -228,7 +233,7 @@ static struct cpumask *lookup_layer_cpumask(int idx) if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) { return (struct cpumask *)cpumaskw->cpumask; } else { - scx_bpf_error("no layer_cpumask"); + __COMPAT_scx_bpf_error("no layer_cpumask"); return NULL; } } @@ -255,7 +260,7 @@ static void refresh_cpumasks(int idx) */ barrier_var(cpumaskw); if (!cpumaskw || !cpumaskw->cpumask) { - scx_bpf_error("can't happen"); + __COMPAT_scx_bpf_error("can't happen"); return; } @@ -266,14 +271,14 @@ static void refresh_cpumasks(int idx) bpf_cpumask_clear_cpu(cpu, cpumaskw->cpumask); } } else { - scx_bpf_error("can't happen"); + __COMPAT_scx_bpf_error("can't happen"); } } // XXX - shouldn't be necessary layer = MEMBER_VPTR(layers, [idx]); if (!layer) { - scx_bpf_error("can't happen"); + __COMPAT_scx_bpf_error("can't happen"); return; } @@ -288,7 +293,7 @@ u32 llc_node_id(u32 llc_id) llc_ptr = MEMBER_VPTR(llc_numa_id_map, [llc_id]); if (!llc_ptr) { - scx_bpf_error("Couldn't look up llc ID for %d", llc_id); + __COMPAT_scx_bpf_error("Couldn't look up llc ID for %d", llc_id); return 0; } return *llc_ptr; @@ -314,6 +319,7 @@ struct task_ctx { struct bpf_cpumask __kptr *layered_cpumask; bool all_cpus_allowed; + bool dispatch_local; u64 runnable_at; u64 running_at; }; @@ -335,7 +341,7 @@ static struct task_ctx *lookup_task_ctx(struct task_struct *p) struct task_ctx *tctx = lookup_task_ctx_may_fail(p); if (!tctx) - scx_bpf_error("task_ctx lookup failed"); + __COMPAT_scx_bpf_error("task_ctx lookup failed"); return tctx; } @@ -343,7 +349,7 @@ static struct task_ctx *lookup_task_ctx(struct task_struct *p) static struct layer *lookup_layer(int idx) { if (idx < 0 || idx >= nr_layers) { - scx_bpf_error("invalid layer %d", idx); + __COMPAT_scx_bpf_error("invalid layer %d", idx); return NULL; } return &layers[idx]; @@ -376,7 +382,7 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, thread_head = &leader->signal->thread_head; if (!(next = bpf_task_acquire(leader))) { - scx_bpf_error("failed to acquire leader"); + __COMPAT_scx_bpf_error("failed to acquire leader"); return 0; } @@ -568,9 +574,8 @@ s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64 cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx, layer, true); if (cpu >= 0) { - lstat_inc(LSTAT_SEL_LOCAL, layer, cctx); u64 layer_slice_ns = layer->slice_ns > 0 ? layer->slice_ns : slice_ns; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, layer_slice_ns, 0); + tctx->dispatch_local = true; return cpu; } else { return prev_cpu; @@ -588,7 +593,7 @@ bool pick_idle_cpu_and_kick(struct task_struct *p, s32 task_cpu, if (cpu >= 0) { lstat_inc(LSTAT_KICK, layer, cctx); - scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + scx_bpf_kick_cpu(cpu, 0 /*SCX_KICK_IDLE*/); return true; } else { return false; @@ -657,6 +662,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags) !(layer = lookup_layer(tctx->layer))) return; + if (tctx->dispatch_local) { + tctx->dispatch_local = false; + lstat_inc(LSTAT_SEL_LOCAL, layer, cctx); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + try_preempt_first = cctx->try_preempt_first; cctx->try_preempt_first = false; u64 layer_slice_ns = layer->slice_ns > 0 ? 
layer->slice_ns : slice_ns; @@ -1028,7 +1040,7 @@ static __noinline bool match_one(struct layer_match *match, case MATCH_TGID_EQUALS: return p->tgid == match->tgid; default: - scx_bpf_error("invalid match kind %d", match->kind); + __COMPAT_scx_bpf_error("invalid match kind %d", match->kind); return result; } } @@ -1134,7 +1146,7 @@ static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx) */ p->scx.dsq_vtime = layer->vtime_now; } else { - scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid); + __COMPAT_scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid); } if (tctx->layer < nr_layers - 1) @@ -1148,13 +1160,13 @@ static s32 create_save_cpumask(struct bpf_cpumask **kptr) cpumask = bpf_cpumask_create(); if (!cpumask) { - scx_bpf_error("Failed to create cpumask"); + __COMPAT_scx_bpf_error("Failed to create cpumask"); return -ENOMEM; } cpumask = bpf_kptr_xchg(kptr, cpumask); if (cpumask) { - scx_bpf_error("kptr already had cpumask"); + __COMPAT_scx_bpf_error("kptr already had cpumask"); bpf_cpumask_release(cpumask); } @@ -1171,7 +1183,7 @@ static s32 create_node(u32 node_id) nodec = bpf_map_lookup_elem(&node_data, &node_id); if (!nodec) { /* Should never happen, it's created statically at load time. */ - scx_bpf_error("No node%u", node_id); + __COMPAT_scx_bpf_error("No node%u", node_id); return -ENOENT; } nodec->id = node_id; @@ -1184,7 +1196,7 @@ static s32 create_node(u32 node_id) cpumask = nodec->cpumask; if (!cpumask) { bpf_rcu_read_unlock(); - scx_bpf_error("Failed to lookup node cpumask"); + __COMPAT_scx_bpf_error("Failed to lookup node cpumask"); return -ENOENT; } @@ -1193,7 +1205,7 @@ static s32 create_node(u32 node_id) nmask = MEMBER_VPTR(numa_cpumasks, [node_id][cpu / 64]); if (!nmask) { - scx_bpf_error("array index error"); + __COMPAT_scx_bpf_error("array index error"); ret = -ENOENT; break; } @@ -1263,7 +1275,7 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p) } if (layer->perf > 0) - scx_bpf_cpuperf_set(task_cpu, layer->perf); + __COMPAT_scx_bpf_cpuperf_set(task_cpu, layer->perf); cctx->maybe_idle = false; } @@ -1358,7 +1370,7 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, return; if (!all_cpumask) { - scx_bpf_error("NULL all_cpumask"); + __COMPAT_scx_bpf_error("NULL all_cpumask"); return; } @@ -1372,8 +1384,7 @@ void BPF_STRUCT_OPS(layered_cpu_release, s32 cpu, scx_bpf_reenqueue_local(); } -s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, - struct scx_init_task_args *args) +s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, struct scx_enable_args *args) { struct task_ctx *tctx; struct bpf_cpumask *cpumask; @@ -1386,7 +1397,7 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, tctx = bpf_task_storage_get(&task_ctxs, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!tctx) { - scx_bpf_error("task_ctx allocation failure"); + __COMPAT_scx_bpf_error("task_ctx allocation failure"); return -ENOMEM; } @@ -1410,7 +1421,7 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, tctx->all_cpus_allowed = bpf_cpumask_subset((const struct cpumask *)all_cpumask, p->cpus_ptr); else - scx_bpf_error("missing all_cpumask"); + __COMPAT_scx_bpf_error("missing all_cpumask"); /* * We are matching cgroup hierarchy path directly rather than the CPU @@ -1422,8 +1433,7 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p, - struct scx_exit_task_args *args) +void BPF_STRUCT_OPS(layered_disable, struct task_struct 
*p) { struct cpu_ctx *cctx; struct task_ctx *tctx; @@ -1435,96 +1445,98 @@ void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p, __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); } -static u64 dsq_first_runnable_for_ms(u64 dsq_id, u64 now) -{ - struct task_struct *p; - - if (dsq_id > LO_FALLBACK_DSQ) - return 0; - - bpf_for_each(scx_dsq, p, dsq_id, 0) { - struct task_ctx *tctx; - - if ((tctx = lookup_task_ctx(p))) - return (now - tctx->runnable_at) / 1000000; - } - - return 0; -} - -static void dump_layer_cpumask(int idx) -{ - struct cpumask *layer_cpumask; - s32 cpu; - char buf[128] = "", *p; - - if (!(layer_cpumask = lookup_layer_cpumask(idx))) - return; - - bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { - if (!(p = MEMBER_VPTR(buf, [idx++]))) - break; - if (bpf_cpumask_test_cpu(cpu, layer_cpumask)) - *p++ = '0' + cpu % 10; - else - *p++ = '.'; - - if ((cpu & 7) == 7) { - if (!(p = MEMBER_VPTR(buf, [idx++]))) - break; - *p++ = '|'; - } - } - buf[sizeof(buf) - 1] = '\0'; - - scx_bpf_dump("%s", buf); -} - -void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx) -{ - u64 now = bpf_ktime_get_ns(); - int i, j, idx; - struct layer *layer; - - bpf_for(i, 0, nr_layers) { - layer = lookup_layer(i); - if (!layer) { - scx_bpf_error("unabled to lookup layer %d", i); - continue; - } - - if (disable_topology) { - scx_bpf_dump("LAYER[%d] nr_cpus=%u nr_queued=%d -%llums cpus=", - i, layers[i].nr_cpus, scx_bpf_dsq_nr_queued(i), - dsq_first_runnable_for_ms(i, now)); - } else { - bpf_for(j, 0, nr_llcs) { - if (!(layer->cache_mask & (1 << j))) - continue; - - idx = layer_dsq_id(layer->idx, j); - scx_bpf_dump("LAYER[%d]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=", - i, idx, layers[i].nr_cpus, scx_bpf_dsq_nr_queued(idx), - dsq_first_runnable_for_ms(idx, now)); - } - } - dump_layer_cpumask(i); - scx_bpf_dump("\n"); - } - - scx_bpf_dump("HI_FALLBACK nr_queued=%d -%llums\n", - scx_bpf_dsq_nr_queued(HI_FALLBACK_DSQ), - dsq_first_runnable_for_ms(HI_FALLBACK_DSQ, now)); - scx_bpf_dump("LO_FALLBACK nr_queued=%d -%llums\n", - scx_bpf_dsq_nr_queued(LO_FALLBACK_DSQ), - dsq_first_runnable_for_ms(LO_FALLBACK_DSQ, now)); -} +// static u64 dsq_first_runnable_for_ms(u64 dsq_id, u64 now) +// { +// struct task_struct *p; +// +// if (dsq_id > LO_FALLBACK_DSQ) +// return 0; +// +// bpf_for_each(scx_dsq, p, dsq_id, 0) { +// struct task_ctx *tctx; +// +// if ((tctx = lookup_task_ctx(p))) +// return (now - tctx->runnable_at) / 1000000; +// } +// +// return 0; +// } + +// static void dump_layer_cpumask(int idx) +// { +// struct cpumask *layer_cpumask; +// s32 cpu; +// char buf[128] = "", *p; +// +// if (!(layer_cpumask = lookup_layer_cpumask(idx))) +// return; +// +// bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { +// if (!(p = MEMBER_VPTR(buf, [idx++]))) +// break; +// if (bpf_cpumask_test_cpu(cpu, layer_cpumask)) +// *p++ = '0' + cpu % 10; +// else +// *p++ = '.'; +// +// if ((cpu & 7) == 7) { +// if (!(p = MEMBER_VPTR(buf, [idx++]))) +// break; +// *p++ = '|'; +// } +// } +// buf[sizeof(buf) - 1] = '\0'; +// +// scx_bpf_dump("%s", buf); +// } +// +// void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx) +// { +// u64 now = bpf_ktime_get_ns(); +// int i, j, idx; +// struct layer *layer; +// +// bpf_for(i, 0, nr_layers) { +// layer = lookup_layer(i); +// if (!layer) { +// __COMPAT_scx_bpf_error("unabled to lookup layer %d", i); +// continue; +// } +// +// if (disable_topology) { +// scx_bpf_dump("LAYER[%d] nr_cpus=%u nr_queued=%d -%llums cpus=", +// i, layers[i].nr_cpus, scx_bpf_dsq_nr_queued(i), +// 
dsq_first_runnable_for_ms(i, now)); +// } else { +// bpf_for(j, 0, nr_llcs) { +// if (!(layer->cache_mask & (1 << j))) +// continue; +// +// idx = layer_dsq_id(layer->idx, j); +// scx_bpf_dump("LAYER[%d]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=", +// i, idx, layers[i].nr_cpus, scx_bpf_dsq_nr_queued(idx), +// dsq_first_runnable_for_ms(idx, now)); +// } +// } +// dump_layer_cpumask(i); +// scx_bpf_dump("\n"); +// } +// +// scx_bpf_dump("HI_FALLBACK nr_queued=%d -%llums\n", +// scx_bpf_dsq_nr_queued(HI_FALLBACK_DSQ), +// dsq_first_runnable_for_ms(HI_FALLBACK_DSQ, now)); +// scx_bpf_dump("LO_FALLBACK nr_queued=%d -%llums\n", +// scx_bpf_dsq_nr_queued(LO_FALLBACK_DSQ), +// dsq_first_runnable_for_ms(LO_FALLBACK_DSQ, now)); +// } s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) { struct bpf_cpumask *cpumask; int i, j, k, nr_online_cpus, ret; + __COMPAT_scx_bpf_switch_all(); + ret = scx_bpf_create_dsq(HI_FALLBACK_DSQ, -1); if (ret < 0) return ret; @@ -1572,19 +1584,19 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) layer->exclusive); if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) { - scx_bpf_error("too many ORs"); + __COMPAT_scx_bpf_error("too many ORs"); return -EINVAL; } bpf_for(j, 0, layer->nr_match_ors) { struct layer_match_ands *ands = MEMBER_VPTR(layers, [i].matches[j]); if (!ands) { - scx_bpf_error("shouldn't happen"); + __COMPAT_scx_bpf_error("shouldn't happen"); return -EINVAL; } if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { - scx_bpf_error("too many ANDs"); + __COMPAT_scx_bpf_error("too many ANDs"); return -EINVAL; } @@ -1600,7 +1612,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) match = MEMBER_VPTR(layers, [i].matches[j].matches[k]); if (!match) { - scx_bpf_error("shouldn't happen"); + __COMPAT_scx_bpf_error("shouldn't happen"); return -EINVAL; } @@ -1639,7 +1651,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) dbg("%s TGID %u", header, match->tgid); break; default: - scx_bpf_error("%s Invalid kind", header); + __COMPAT_scx_bpf_error("%s Invalid kind", header); return -EINVAL; } } @@ -1695,7 +1707,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) void BPF_STRUCT_OPS(layered_exit, struct scx_exit_info *ei) { - UEI_RECORD(uei, ei); + uei_record(&uei, ei); } SCX_OPS_DEFINE(layered, @@ -1710,9 +1722,11 @@ SCX_OPS_DEFINE(layered, .set_weight = (void *)layered_set_weight, .set_cpumask = (void *)layered_set_cpumask, .cpu_release = (void *)layered_cpu_release, - .init_task = (void *)layered_init_task, - .exit_task = (void *)layered_exit_task, - .dump = (void *)layered_dump, + .prep_enable = (void *)layered_prep_enable, + .disable = (void *)layered_disable, + // .dump = (void *)layered_dump, .init = (void *)layered_init, .exit = (void *)layered_exit, + .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_LAST, .name = "layered"); + diff --git a/scheds/rust/scx_layered/src/bpf/util.bpf.h b/scheds/rust/scx_layered/src/bpf/util.bpf.h new file mode 120000 index 000000000..ee7b16c86 --- /dev/null +++ b/scheds/rust/scx_layered/src/bpf/util.bpf.h @@ -0,0 +1 @@ +util.bpf.c \ No newline at end of file diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs index 0fe447df4..639414948 100644 --- a/scheds/rust/scx_layered/src/main.rs +++ b/scheds/rust/scx_layered/src/main.rs @@ -6,6 +6,11 @@ mod bpf_skel; mod stats; pub use bpf_skel::*; pub mod bpf_intf; +use core::ffi::CStr; +use stats::LayerStats; +use stats::StatsReq; +use stats::StatsRes; +use stats::SysStats; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::HashMap; @@ -43,20 +48,10 @@ use 
scx_stats::prelude::*; use scx_utils::compat; use scx_utils::init_libbpf_logging; use scx_utils::ravg::ravg_read; -use scx_utils::scx_ops_attach; -use scx_utils::scx_ops_load; -use scx_utils::scx_ops_open; -use scx_utils::uei_exited; -use scx_utils::uei_report; use scx_utils::Cache; use scx_utils::Topology; -use scx_utils::UserExitInfo; use serde::Deserialize; use serde::Serialize; -use stats::LayerStats; -use stats::StatsReq; -use stats::StatsRes; -use stats::SysStats; const RAVG_FRAC_BITS: u32 = bpf_intf::ravg_consts_RAVG_FRAC_BITS; const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize; @@ -420,6 +415,12 @@ struct Opts { #[clap(long)] monitor: Option, + /// DEPRECATED: Enable output of stats in OpenMetrics format instead of via + /// log macros. This option is useful if you want to collect stats in some + /// monitoring database like prometheseus. + #[clap(short = 'o', long)] + open_metrics_format: bool, + /// Run with example layer specifications (useful for e.g. CI pipelines) #[clap(long)] run_example: bool, @@ -888,6 +889,66 @@ impl Stats { } } +#[derive(Debug, Default)] +struct UserExitInfo { + kind: i32, + reason: Option, + msg: Option, +} + +impl UserExitInfo { + fn read(bpf_uei: &types::user_exit_info) -> Result { + let kind = unsafe { std::ptr::read_volatile(&bpf_uei.kind as *const _) }; + + let (reason, msg) = if kind != 0 { + ( + Some( + unsafe { CStr::from_ptr(bpf_uei.reason.as_ptr() as *const _) } + .to_str() + .context("Failed to convert reason to string")? + .to_string(), + ) + .filter(|s| !s.is_empty()), + Some( + unsafe { CStr::from_ptr(bpf_uei.msg.as_ptr() as *const _) } + .to_str() + .context("Failed to convert msg to string")? + .to_string(), + ) + .filter(|s| !s.is_empty()), + ) + } else { + (None, None) + }; + + Ok(Self { kind, reason, msg }) + } + + fn exited(bpf_uei: &types::user_exit_info) -> Result { + Ok(Self::read(bpf_uei)?.kind != 0) + } + + fn report(&self) -> Result<()> { + let why = match (&self.reason, &self.msg) { + (Some(reason), None) => format!("{}", reason), + (Some(reason), Some(msg)) => format!("{} ({})", reason, msg), + _ => "".into(), + }; + + match self.kind { + 0 => Ok(()), + etype => { + if etype != 64 { + bail!("EXIT: kind={} {}", etype, why); + } else { + info!("EXIT: {}", why); + Ok(()) + } + } + } + } +} + #[derive(Debug)] struct CpuPool { nr_cores: usize, @@ -1091,12 +1152,7 @@ impl CpuPool { } } -fn layer_core_order( - spec: &LayerSpec, - growth_algo: LayerGrowthAlgo, - layer_idx: usize, - topo: &Topology, -) -> Vec { +fn layer_core_order(growth_algo: LayerGrowthAlgo, layer_idx: usize, topo: &Topology) -> Vec { let mut core_order = vec![]; match growth_algo { LayerGrowthAlgo::Sticky => { @@ -1146,20 +1202,27 @@ struct Layer { } impl Layer { - fn new(spec: &LayerSpec, idx: usize, cpu_pool: &CpuPool, topo: &Topology) -> Result { - let name = &spec.name; - let kind = spec.kind.clone(); + fn new( + idx: usize, + cpu_pool: &CpuPool, + name: &str, + kind: LayerKind, + topo: &Topology, + ) -> Result { let mut cpus = bitvec![0; cpu_pool.nr_cpus]; cpus.fill(false); let mut allowed_cpus = bitvec![0; cpu_pool.nr_cpus]; + let mut layer_growth_algo = LayerGrowthAlgo::Sticky; match &kind { LayerKind::Confined { cpus_range, util_range, nodes, llcs, + growth_algo, .. 
} => { + layer_growth_algo = growth_algo.clone(); let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 { bail!("invalid cpus_range {:?}", cpus_range); @@ -1195,7 +1258,19 @@ impl Layer { bail!("invalid util_range {:?}", util_range); } } - LayerKind::Grouped { nodes, llcs, .. } | LayerKind::Open { nodes, llcs, .. } => { + LayerKind::Grouped { + growth_algo, + nodes, + llcs, + .. + } + | LayerKind::Open { + growth_algo, + nodes, + llcs, + .. + } => { + layer_growth_algo = growth_algo.clone(); if nodes.len() == 0 && llcs.len() == 0 { allowed_cpus.fill(true); } else { @@ -1220,13 +1295,7 @@ impl Layer { } } - let layer_growth_algo = match &kind { - LayerKind::Confined { growth_algo, .. } - | LayerKind::Grouped { growth_algo, .. } - | LayerKind::Open { growth_algo, .. } => growth_algo.clone(), - }; - - let core_order = layer_core_order(spec, layer_growth_algo, idx, topo); + let core_order = layer_core_order(layer_growth_algo, idx, topo); Ok(Self { name: name.into(), @@ -1273,7 +1342,8 @@ impl Layer { { trace!( "layer-{} needs more CPUs (util={:.3}) but is over the load fraction", - &self.name, layer_util + &self.name, + layer_util ); return Ok(false); } @@ -1591,7 +1661,7 @@ impl<'a, 'b> Scheduler<'a, 'b> { skel.maps.rodata_data.nr_llcs = 0; for node in topo.nodes() { - debug!( + info!( "configuring node {}, LLCs {:?}", node.id(), node.llcs().len() @@ -1599,7 +1669,7 @@ impl<'a, 'b> Scheduler<'a, 'b> { skel.maps.rodata_data.nr_llcs += node.llcs().len() as u32; for (_, llc) in node.llcs() { - debug!("configuring llc {:?} for node {:?}", llc.id(), node.id()); + info!("configuring llc {:?} for node {:?}", llc.id(), node.id()); skel.maps.rodata_data.llc_numa_id_map[llc.id()] = node.id() as u32; } } @@ -1622,7 +1692,9 @@ impl<'a, 'b> Scheduler<'a, 'b> { let mut skel_builder = BpfSkelBuilder::default(); skel_builder.obj_builder.debug(opts.verbose > 1); init_libbpf_logging(None); - let mut skel = scx_ops_open!(skel_builder, open_object, layered)?; + let mut skel = skel_builder + .open(open_object) + .context("failed to open BPF program")?; // scheduler_tick() got renamed to sched_tick() during v6.10-rc. let sched_tick_name = match compat::ksym_exists("sched_tick")? { @@ -1636,7 +1708,7 @@ impl<'a, 'b> Scheduler<'a, 'b> { .context("Failed to set attach target for sched_tick_fentry()")?; // Initialize skel according to @opts. - skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len; + // skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len; skel.maps.rodata_data.debug = opts.verbose as u32; skel.maps.rodata_data.slice_ns = opts.slice_us * 1000; @@ -1657,11 +1729,17 @@ impl<'a, 'b> Scheduler<'a, 'b> { Self::init_layers(&mut skel, opts, layer_specs, &topo)?; Self::init_nodes(&mut skel, opts, &topo); - let mut skel = scx_ops_load!(skel, layered, uei)?; + let mut skel = skel.load().context("Failed to load BPF program")?; let mut layers = vec![]; for (idx, spec) in layer_specs.iter().enumerate() { - layers.push(Layer::new(&spec, idx, &cpu_pool, &topo)?); + layers.push(Layer::new( + idx, + &cpu_pool, + &spec.name, + spec.kind.clone(), + &topo, + )?); } // Other stuff. @@ -1674,11 +1752,10 @@ impl<'a, 'b> Scheduler<'a, 'b> { // huge problem in the interim until we figure it out. // Attach. 
- let struct_ops = scx_ops_attach!(skel, layered)?; let stats_server = StatsServer::new(stats::server_data()).launch()?; - let sched = Self { - struct_ops: Some(struct_ops), + let mut sched = Self { + struct_ops: None, layer_specs, sched_intv: Duration::from_secs_f64(opts.interval), @@ -1698,6 +1775,20 @@ impl<'a, 'b> Scheduler<'a, 'b> { stats_server, }; + sched + .skel + .attach() + .context("Failed to attach BPF program")?; + + sched.struct_ops = Some( + sched + .skel + .maps + .layered + .attach_struct_ops() + .context("Failed to attach layered struct ops")?, + ); + info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics."); Ok(sched) @@ -1821,12 +1912,14 @@ impl<'a, 'b> Scheduler<'a, 'b> { Ok(sys_stats) } - fn run(&mut self, shutdown: Arc) -> Result { + fn run(&mut self, shutdown: Arc) -> Result<()> { let (res_ch, req_ch) = self.stats_server.channels(); let mut next_sched_at = Instant::now() + self.sched_intv; let mut cpus_ranges = HashMap::>::new(); - while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) { + while !shutdown.load(Ordering::Relaxed) + && !UserExitInfo::exited(&self.skel.maps.bss_data.uei)? + { let now = Instant::now(); if now >= next_sched_at { @@ -1873,7 +1966,7 @@ impl<'a, 'b> Scheduler<'a, 'b> { } self.struct_ops.take(); - uei_report!(&self.skel, uei) + UserExitInfo::read(&self.skel.maps.bss_data.uei)?.report() } } @@ -2051,6 +2144,10 @@ fn main() -> Result<()> { ); } + if opts.open_metrics_format { + warn!("open_metrics_format is deprecated"); + } + debug!("specs={}", serde_json::to_string_pretty(&layer_config)?); verify_layer_specs(&layer_config.specs)?; @@ -2072,12 +2169,6 @@ fn main() -> Result<()> { } let mut open_object = MaybeUninit::uninit(); - loop { - let mut sched = Scheduler::init(&opts, &layer_config.specs, &mut open_object)?; - if !sched.run(shutdown.clone())?.should_restart() { - break; - } - } - - Ok(()) + let mut sched = Scheduler::init(&opts, &layer_config.specs, &mut open_object)?; + sched.run(shutdown.clone()) } diff --git a/scheds/rust/scx_layered/src/stats.rs b/scheds/rust/scx_layered/src/stats.rs index 5a70fcfbb..c293a3309 100644 --- a/scheds/rust/scx_layered/src/stats.rs +++ b/scheds/rust/scx_layered/src/stats.rs @@ -150,7 +150,11 @@ impl LayerStats { } }; let calc_frac = |a, b| { - if b != 0.0 { a / b * 100.0 } else { 0.0 } + if b != 0.0 { + a / b * 100.0 + } else { + 0.0 + } }; let is_excl = match &layer.kind { diff --git a/scheds/rust/scx_rusty/src/bpf/main.bpf.c b/scheds/rust/scx_rusty/src/bpf/main.bpf.c index 4764b1d25..7f51fb542 100644 --- a/scheds/rust/scx_rusty/src/bpf/main.bpf.c +++ b/scheds/rust/scx_rusty/src/bpf/main.bpf.c @@ -48,7 +48,7 @@ char _license[] SEC("license") = "GPL"; -UEI_DEFINE(uei); + /* * const volatiles are set during initialization and treated as consts by the @@ -78,6 +78,13 @@ const volatile u32 debug; /* base slice duration */ static u64 slice_ns = SCX_SLICE_DFL; +#define __COMPAT_scx_bpf_error(fmt, args...) 
\ + do { \ + bpf_printk(fmt, ##args); \ + } while (0) + +struct user_exit_info uei; + /* * Per-CPU context */ @@ -172,7 +179,7 @@ static struct dom_ctx *lookup_dom_ctx(u32 dom_id) domc = try_lookup_dom_ctx(dom_id); if (!domc) - scx_bpf_error("Failed to lookup dom[%u]", dom_id); + __COMPAT_scx_bpf_error("Failed to lookup dom[%u]", dom_id); return domc; } @@ -190,7 +197,7 @@ static struct task_ctx *lookup_task_ctx(struct task_struct *p) taskc = try_lookup_task_ctx(p); if (!taskc) - scx_bpf_error("task_ctx lookup failed for pid %d", p->pid); + __COMPAT_scx_bpf_error("task_ctx lookup failed for pid %d", p->pid); return taskc; } @@ -201,7 +208,7 @@ static struct pcpu_ctx *lookup_pcpu_ctx(s32 cpu) pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]); if (!pcpuc) - scx_bpf_error("Failed to lookup pcpu ctx for %d", cpu); + __COMPAT_scx_bpf_error("Failed to lookup pcpu ctx for %d", cpu); return pcpuc; } @@ -230,7 +237,7 @@ static struct bucket_ctx *lookup_dom_bucket(struct dom_ctx *dom_ctx, if (bucket) return bucket; - scx_bpf_error("Failed to lookup dom bucket"); + __COMPAT_scx_bpf_error("Failed to lookup dom bucket"); return NULL; } @@ -243,7 +250,7 @@ static struct lock_wrapper *lookup_dom_bkt_lock(u32 dom_id, u32 weight) if (lockw) return lockw; - scx_bpf_error("Failed to lookup dom lock"); + __COMPAT_scx_bpf_error("Failed to lookup dom lock"); return NULL; } @@ -254,7 +261,7 @@ static struct lock_wrapper *lookup_dom_vtime_lock(u32 dom_id) lockw = bpf_map_lookup_elem(&dom_vtime_locks, &idx); if (!lockw) - scx_bpf_error("Failed to lookup dom lock"); + __COMPAT_scx_bpf_error("Failed to lookup dom lock"); return lockw; } @@ -297,7 +304,7 @@ static void dom_dcycle_adj(u32 dom_id, u32 weight, u64 now, bool runnable) bpf_spin_unlock(&lockw->lock); if (adj < 0 && (s64)bucket->dcycle < 0) - scx_bpf_error("cpu%d dom%u bucket%u load underflow (dcycle=%lld adj=%lld)", + __COMPAT_scx_bpf_error("cpu%d dom%u bucket%u load underflow (dcycle=%lld adj=%lld)", bpf_get_smp_processor_id(), dom_id, bucket_idx, bucket->dcycle, adj); @@ -397,7 +404,7 @@ int dom_xfer_task(pid_t pid, u32 new_dom_id, u64 now) p = bpf_task_from_pid(pid); if (!p) { - scx_bpf_error("Failed to lookup task %d", pid); + __COMPAT_scx_bpf_error("Failed to lookup task %d", pid); return 0; } @@ -563,7 +570,7 @@ const int sched_prio_to_weight[DL_MAX_LAT_PRIO + 1] = { static u64 sched_prio_to_latency_weight(u64 prio) { if (prio >= DL_MAX_LAT_PRIO) { - scx_bpf_error("Invalid prio index"); + __COMPAT_scx_bpf_error("Invalid prio index"); return 0; } @@ -751,7 +758,7 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, t_cpumask = taskc->cpumask; if (!t_cpumask) { - scx_bpf_error("Failed to look up task cpumask"); + __COMPAT_scx_bpf_error("Failed to look up task cpumask"); return false; } @@ -770,7 +777,7 @@ static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, d_cpumask = new_domc->cpumask; if (!d_cpumask) { - scx_bpf_error("Failed to get dom%u cpumask kptr", + __COMPAT_scx_bpf_error("Failed to get dom%u cpumask kptr", new_dom_id); return false; } @@ -820,7 +827,7 @@ static s32 try_sync_wakeup(struct task_struct *p, struct task_ctx *taskc, d_cpumask = domc->cpumask; if (!d_cpumask) { - scx_bpf_error("Failed to acquire dom%u cpumask kptr", + __COMPAT_scx_bpf_error("Failed to acquire dom%u cpumask kptr", taskc->dom_id); return -ENOENT; } @@ -970,7 +977,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, tmp_direct_greedy = direct_greedy_cpumask; if (!tmp_direct_greedy) { - scx_bpf_error("Failed to 
lookup direct_greedy mask"); + __COMPAT_scx_bpf_error("Failed to lookup direct_greedy mask"); goto enoent; } /* @@ -984,13 +991,13 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, if (!direct_greedy_numa) { node_mask = domc->node_cpumask; if (!node_mask) { - scx_bpf_error("Failed to lookup node mask"); + __COMPAT_scx_bpf_error("Failed to lookup node mask"); goto enoent; } tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, NULL); if (!tmp_cpumask) { - scx_bpf_error("Failed to lookup tmp cpumask"); + __COMPAT_scx_bpf_error("Failed to lookup tmp cpumask"); goto enoent; } bpf_cpumask_and(tmp_cpumask, @@ -1097,7 +1104,7 @@ void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) if (!(taskc = lookup_task_ctx(p))) return; if (!(p_cpumask = taskc->cpumask)) { - scx_bpf_error("NULL cpumask"); + __COMPAT_scx_bpf_error("NULL cpumask"); return; } @@ -1188,7 +1195,7 @@ u32 dom_node_id(u32 dom_id) nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]); if (!nid_ptr) { - scx_bpf_error("Couldn't look up node ID for %d", dom_id); + __COMPAT_scx_bpf_error("Couldn't look up node ID for %d", dom_id); return 0; } return *nid_ptr; @@ -1391,7 +1398,7 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) dom_id = taskc->dom_id; if (dom_id >= MAX_DOMS) { - scx_bpf_error("Invalid dom ID"); + __COMPAT_scx_bpf_error("Invalid dom ID"); return; } @@ -1408,7 +1415,7 @@ void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) pidp = MEMBER_VPTR(dom_active_pids, [dom_id].pids[idx]); if (!pidp) { - scx_bpf_error("dom_active_pids[%u][%llu] indexing failed", + __COMPAT_scx_bpf_error("dom_active_pids[%u][%llu] indexing failed", dom_id, idx); return; } @@ -1550,7 +1557,7 @@ static void task_pick_and_set_domain(struct task_ctx *taskc, dom_id = task_pick_domain(taskc, p, cpumask); if (!task_set_domain(taskc, p, dom_id, init_dsq_vtime)) - scx_bpf_error("Failed to set dom%d for %s[%d]", + __COMPAT_scx_bpf_error("Failed to set dom%d for %s[%d]", dom_id, p->comm, p->pid); } @@ -1574,21 +1581,21 @@ static s32 create_save_cpumask(struct bpf_cpumask **kptr) cpumask = bpf_cpumask_create(); if (!cpumask) { - scx_bpf_error("Failed to create cpumask"); + __COMPAT_scx_bpf_error("Failed to create cpumask"); return -ENOMEM; } cpumask = bpf_kptr_xchg(kptr, cpumask); if (cpumask) { - scx_bpf_error("kptr already had cpumask"); + __COMPAT_scx_bpf_error("kptr already had cpumask"); bpf_cpumask_release(cpumask); } return 0; } -s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p, - struct scx_init_task_args *args) +s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, + struct scx_enable_args *args) { u64 now = bpf_ktime_get_ns(); struct task_ctx taskc = { @@ -1643,25 +1650,25 @@ s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p, return 0; } - -void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p, - struct scx_exit_task_args *args) +/* +void BPF_STRUCT_OPS(rusty_cancel_enable, struct task_struct *p, + struct scx_enable_args *args) { pid_t pid = p->pid; long ret; - /* - * XXX - There's no reason delete should fail here but BPF's recursion - * protection can unnecessarily fail the operation. The fact that - * deletions aren't reliable means that we sometimes leak task_ctx and - * can't use BPF_NOEXIST on allocation in .prep_enable(). - */ + // XXX - There's no reason delete should fail here but BPF's recursion + // protection can unnecessarily fail the operation. 
The fact that + // deletions aren't reliable means that we sometimes leak task_ctx and + // can't use BPF_NOEXIST on allocation in .prep_enable(). + ret = bpf_map_delete_elem(&task_data, &pid); if (ret) { stat_add(RUSTY_STAT_TASK_GET_ERR, 1); return; } } +*/ static s32 create_node(u32 node_id) { @@ -1673,7 +1680,7 @@ static s32 create_node(u32 node_id) nodec = bpf_map_lookup_elem(&node_data, &node_id); if (!nodec) { /* Should never happen, it's created statically at load time. */ - scx_bpf_error("No node%u", node_id); + __COMPAT_scx_bpf_error("No node%u", node_id); return -ENOENT; } @@ -1685,7 +1692,7 @@ static s32 create_node(u32 node_id) cpumask = nodec->cpumask; if (!cpumask) { bpf_rcu_read_unlock(); - scx_bpf_error("Failed to lookup node cpumask"); + __COMPAT_scx_bpf_error("Failed to lookup node cpumask"); return -ENOENT; } @@ -1694,7 +1701,7 @@ static s32 create_node(u32 node_id) nmask = MEMBER_VPTR(numa_cpumasks, [node_id][cpu / 64]); if (!nmask) { - scx_bpf_error("array index error"); + __COMPAT_scx_bpf_error("array index error"); ret = -ENOENT; break; } @@ -1716,7 +1723,7 @@ static s32 create_dom(u32 dom_id) s32 ret; if (dom_id >= MAX_DOMS) { - scx_bpf_error("Max dom ID %u exceeded (%u)", MAX_DOMS, dom_id); + __COMPAT_scx_bpf_error("Max dom ID %u exceeded (%u)", MAX_DOMS, dom_id); return -EINVAL; } @@ -1724,7 +1731,7 @@ static s32 create_dom(u32 dom_id) ret = scx_bpf_create_dsq(dom_id, node_id); if (ret < 0) { - scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret); + __COMPAT_scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret); return ret; } @@ -1743,7 +1750,7 @@ static s32 create_dom(u32 dom_id) all_mask = all_cpumask; if (!dom_mask || !all_mask) { bpf_rcu_read_unlock(); - scx_bpf_error("Could not find cpumask"); + __COMPAT_scx_bpf_error("Could not find cpumask"); return -ENOENT; } @@ -1752,7 +1759,7 @@ static s32 create_dom(u32 dom_id) dmask = MEMBER_VPTR(dom_cpumasks, [dom_id][cpu / 64]); if (!dmask) { - scx_bpf_error("array index error"); + __COMPAT_scx_bpf_error("array index error"); ret = -ENOENT; break; } @@ -1773,7 +1780,7 @@ static s32 create_dom(u32 dom_id) nodec = bpf_map_lookup_elem(&node_data, &node_id); if (!nodec) { /* Should never happen, it's created statically at load time. 
*/ - scx_bpf_error("No node%u", node_id); + __COMPAT_scx_bpf_error("No node%u", node_id); return -ENOENT; } ret = create_save_cpumask(&domc->node_cpumask); @@ -1785,7 +1792,7 @@ static s32 create_dom(u32 dom_id) dom_mask = domc->node_cpumask; if (!node_mask || !dom_mask) { bpf_rcu_read_unlock(); - scx_bpf_error("cpumask lookup failed"); + __COMPAT_scx_bpf_error("cpumask lookup failed"); return -ENOENT; } bpf_cpumask_copy(dom_mask, (const struct cpumask *)node_mask); @@ -1816,7 +1823,7 @@ static s32 initialize_cpu(s32 cpu) cpumask = domc->cpumask; if (!cpumask) { bpf_rcu_read_unlock(); - scx_bpf_error("Failed to lookup dom node cpumask"); + __COMPAT_scx_bpf_error("Failed to lookup dom node cpumask"); return -ENOENT; } @@ -1834,6 +1841,8 @@ static s32 initialize_cpu(s32 cpu) s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) { s32 i, ret; + + __COMPAT_scx_bpf_switch_all(); ret = create_save_cpumask(&all_cpumask); if (ret) @@ -1872,7 +1881,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) void BPF_STRUCT_OPS(rusty_exit, struct scx_exit_info *ei) { - UEI_RECORD(uei, ei); + uei_record(&uei, ei); } SCX_OPS_DEFINE(rusty, @@ -1885,8 +1894,8 @@ SCX_OPS_DEFINE(rusty, .quiescent = (void *)rusty_quiescent, .set_weight = (void *)rusty_set_weight, .set_cpumask = (void *)rusty_set_cpumask, - .init_task = (void *)rusty_init_task, - .exit_task = (void *)rusty_exit_task, + .prep_enable = (void *)rusty_prep_enable, + /* .cancel_enable = (void *)rusty_cancel_enable, */ .init = (void *)rusty_init, .exit = (void *)rusty_exit, .timeout_ms = 10000, diff --git a/scheds/rust/scx_rusty/src/main.rs b/scheds/rust/scx_rusty/src/main.rs index d9fd0915f..d520a022b 100644 --- a/scheds/rust/scx_rusty/src/main.rs +++ b/scheds/rust/scx_rusty/src/main.rs @@ -16,7 +16,9 @@ pub mod load_balance; use load_balance::LoadBalancer; mod stats; +use core::ffi::CStr; use std::collections::BTreeMap; +use std::ffi::CString; use std::mem::MaybeUninit; use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; @@ -52,11 +54,10 @@ use scx_utils::init_libbpf_logging; use scx_utils::scx_ops_attach; use scx_utils::scx_ops_load; use scx_utils::scx_ops_open; -use scx_utils::uei_exited; -use scx_utils::uei_report; +// use scx_utils::uei_exited; +// use scx_utils::uei_report; use scx_utils::Cpumask; use scx_utils::Topology; -use scx_utils::UserExitInfo; use scx_utils::NR_CPU_IDS; const MAX_DOMS: usize = bpf_intf::consts_MAX_DOMS as usize; @@ -432,7 +433,7 @@ impl<'a> Scheduler<'a> { if opts.partial { skel.struct_ops.rusty_mut().flags |= *compat::SCX_OPS_SWITCH_PARTIAL; } - skel.struct_ops.rusty_mut().exit_dump_len = opts.exit_dump_len; + // skel.struct_ops.rusty_mut().exit_dump_len = opts.exit_dump_len; skel.maps.rodata_data.load_half_life = (opts.load_half_life * 1000000000.0) as u32; skel.maps.rodata_data.kthreads_local = opts.kthreads_local; @@ -566,7 +567,7 @@ impl<'a> Scheduler<'a> { Ok(()) } - fn run(&mut self, shutdown: Arc) -> Result { + fn run(&mut self, shutdown: Arc) -> Result<()> { let (res_ch, req_ch) = self.stats_server.channels(); let now = Instant::now(); let mut next_tune_at = now + self.tune_interval; @@ -574,7 +575,9 @@ impl<'a> Scheduler<'a> { self.skel.maps.stats.value_size() as usize; - while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) { + while !shutdown.load(Ordering::Relaxed) + && !UserExitInfo::exited(&self.skel.maps.bss_data.uei)? 
+ { let now = Instant::now(); if now >= next_tune_at { @@ -608,7 +611,7 @@ impl<'a> Scheduler<'a> { } self.struct_ops.take(); - uei_report!(&self.skel, uei) + UserExitInfo::read(&self.skel.maps.bss_data.uei)?.report() } } @@ -669,11 +672,66 @@ fn main() -> Result<()> { } let mut open_object = MaybeUninit::uninit(); - loop { - let mut sched = Scheduler::init(&opts, &mut open_object)?; - if !sched.run(shutdown.clone())?.should_restart() { - break; + let mut sched = Scheduler::init(&opts, &mut open_object)?; + sched.run(shutdown.clone()) +} + +#[derive(Debug, Default)] +struct UserExitInfo { + kind: i32, + reason: Option, + msg: Option, +} + +impl UserExitInfo { + fn read(bpf_uei: &types::user_exit_info) -> Result { + let kind = unsafe { std::ptr::read_volatile(&bpf_uei.kind as *const _) }; + + let (reason, msg) = if kind != 0 { + ( + Some( + unsafe { CStr::from_ptr(bpf_uei.reason.as_ptr() as *const _) } + .to_str() + .context("Failed to convert reason to string")? + .to_string(), + ) + .filter(|s| !s.is_empty()), + Some( + unsafe { CStr::from_ptr(bpf_uei.msg.as_ptr() as *const _) } + .to_str() + .context("Failed to convert msg to string")? + .to_string(), + ) + .filter(|s| !s.is_empty()), + ) + } else { + (None, None) + }; + + Ok(Self { kind, reason, msg }) + } + + fn exited(bpf_uei: &types::user_exit_info) -> Result { + Ok(Self::read(bpf_uei)?.kind != 0) + } + + fn report(&self) -> Result<()> { + let why = match (&self.reason, &self.msg) { + (Some(reason), None) => format!("{}", reason), + (Some(reason), Some(msg)) => format!("{} ({})", reason, msg), + _ => "".into(), + }; + + match self.kind { + 0 => Ok(()), + etype => { + if etype != 64 { + bail!("EXIT: kind={} {}", etype, why); + } else { + info!("EXIT: {}", why); + Ok(()) + } + } } } - Ok(()) }
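For orientation (illustration only, not part of the patch): both scx_layered and scx_rusty now follow the same BPF-side pattern — a plain struct user_exit_info global that userspace reads from .bss, a __COMPAT_scx_bpf_error() that falls back to bpf_printk(), __COMPAT_scx_bpf_switch_all() called from init, uei_record() called from exit, and the older .prep_enable/.disable callback names in SCX_OPS_DEFINE(). A condensed sketch with hypothetical "example" names, assuming the scx common headers changed above are included:

struct user_exit_info uei;	/* read by userspace via skel.maps.bss_data */

#define __COMPAT_scx_bpf_error(fmt, args...)	\
	do {					\
		bpf_printk(fmt, ##args);	\
	} while (0)

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/* on this older branch, opt all tasks in explicitly */
	__COMPAT_scx_bpf_switch_all();
	return 0;
}

void BPF_STRUCT_OPS(example_exit, struct scx_exit_info *ei)
{
	/* copies ei->reason/ei->msg and publishes ei->type as uei.kind */
	uei_record(&uei, ei);
}

SCX_OPS_DEFINE(example,
	       .init = (void *)example_init,
	       .exit = (void *)example_exit,
	       .name = "example");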