diff --git a/docs/spec-conformance.md b/docs/spec-conformance.md index 5c04210a7bf..a278d76a740 100644 --- a/docs/spec-conformance.md +++ b/docs/spec-conformance.md @@ -10,7 +10,6 @@ Spec version | Feature | PR v1.0.2 | `.linux.personality` | [#3126](https://github.com/opencontainers/runc/pull/3126) v1.1.0 | `SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV` | [#3862](https://github.com/opencontainers/runc/pull/3862) v1.1.0 | rsvd hugetlb cgroup | TODO ([#3859](https://github.com/opencontainers/runc/issues/3859)) -v1.1.0 | `.process.scheduler` | TODO ([#3895](https://github.com/opencontainers/runc/issues/3895)) v1.1.0 | `.process.ioPriority` | [#3783](https://github.com/opencontainers/runc/pull/3783) diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 19541293b6f..1ece49c3732 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -8,6 +8,7 @@ import ( "time" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runtime-spec/specs-go" @@ -219,6 +220,68 @@ type Config struct { // TimeOffsets specifies the offset for supporting time namespaces. TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"` + + // Scheduler represents the scheduling attributes for a process. + Scheduler *Scheduler `json:"scheduler,omitempty"` +} + +// Scheduler is based on the Linux sched_setattr(2) syscall. +type Scheduler = specs.Scheduler + +// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr. +func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) { + var policy uint32 + switch scheduler.Policy { + case specs.SchedOther: + policy = 0 + case specs.SchedFIFO: + policy = 1 + case specs.SchedRR: + policy = 2 + case specs.SchedBatch: + policy = 3 + case specs.SchedISO: + policy = 4 + case specs.SchedIdle: + policy = 5 + case specs.SchedDeadline: + policy = 6 + default: + return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy) + } + + var flags uint64 + for _, flag := range scheduler.Flags { + switch flag { + case specs.SchedFlagResetOnFork: + flags |= 0x01 + case specs.SchedFlagReclaim: + flags |= 0x02 + case specs.SchedFlagDLOverrun: + flags |= 0x04 + case specs.SchedFlagKeepPolicy: + flags |= 0x08 + case specs.SchedFlagKeepParams: + flags |= 0x10 + case specs.SchedFlagUtilClampMin: + flags |= 0x20 + case specs.SchedFlagUtilClampMax: + flags |= 0x40 + default: + return nil, fmt.Errorf("invalid scheduler flag: %s", flag) + } + } + + return &unix.SchedAttr{ + Size: unix.SizeofSchedAttr, + Policy: policy, + Flags: flags, + Nice: scheduler.Nice, + Priority: uint32(scheduler.Priority), + Runtime: scheduler.Runtime, + Deadline: scheduler.Deadline, + Period: scheduler.Period, + }, nil } type ( diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 11b80ddaae6..a6736f26584 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -11,6 +11,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runtime-spec/specs-go" selinux "github.com/opencontainers/selinux/go-selinux" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -30,6 +31,7 @@ func Validate(config *configs.Config) error { intelrdtCheck, rootlessEUIDCheck, mountsStrict, + scheduler, } for _, c := range checks { if err := c(config); err != nil { @@ -353,3 +355,24 @@ func isHostNetNS(path string) (bool, error) { return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil } + +// scheduler is to validate scheduler configs according to https://man7.org/linux/man-pages/man2/sched_setattr.2.html +func scheduler(config *configs.Config) error { + s := config.Scheduler + if s == nil { + return nil + } + if s.Policy == "" { + return errors.New("scheduler policy is required") + } + if s.Nice < -20 || s.Nice > 19 { + return fmt.Errorf("invalid scheduler.nice: %d", s.Nice) + } + if s.Priority != 0 && (s.Policy != specs.SchedFIFO && s.Policy != specs.SchedRR) { + return errors.New("scheduler.priority can only be specified for SchedFIFO or SchedRR policy") + } + if s.Policy != specs.SchedDeadline && (s.Runtime != 0 || s.Deadline != 0 || s.Period != 0) { + return errors.New("scheduler runtime/deadline/period can only be specified for SchedDeadline policy") + } + return nil +} diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go index d2b3c70ad9d..176527ecc60 100644 --- a/libcontainer/configs/validate/validator_test.go +++ b/libcontainer/configs/validate/validator_test.go @@ -616,3 +616,53 @@ func TestValidateIDMapMounts(t *testing.T) { }) } } + +func TestValidateScheduler(t *testing.T) { + testCases := []struct { + isErr bool + policy string + niceValue int32 + priority int32 + runtime uint64 + deadline uint64 + period uint64 + }{ + {isErr: true, niceValue: 0}, + {isErr: false, policy: "SCHED_OTHER", niceValue: 19}, + {isErr: false, policy: "SCHED_OTHER", niceValue: -20}, + {isErr: true, policy: "SCHED_OTHER", niceValue: 20}, + {isErr: true, policy: "SCHED_OTHER", niceValue: -21}, + {isErr: true, policy: "SCHED_OTHER", priority: 100}, + {isErr: false, policy: "SCHED_FIFO", priority: 100}, + {isErr: true, policy: "SCHED_FIFO", runtime: 20}, + {isErr: true, policy: "SCHED_BATCH", deadline: 30}, + {isErr: true, policy: "SCHED_IDLE", period: 40}, + {isErr: true, policy: "SCHED_DEADLINE", priority: 100}, + {isErr: false, policy: "SCHED_DEADLINE", runtime: 200}, + {isErr: false, policy: "SCHED_DEADLINE", deadline: 300}, + {isErr: false, policy: "SCHED_DEADLINE", period: 400}, + } + + for _, tc := range testCases { + scheduler := configs.Scheduler{ + Policy: specs.LinuxSchedulerPolicy(tc.policy), + Nice: tc.niceValue, + Priority: tc.priority, + Runtime: tc.runtime, + Deadline: tc.deadline, + Period: tc.period, + } + config := &configs.Config{ + Rootfs: "/var", + Scheduler: &scheduler, + } + + err := Validate(config) + if tc.isErr && err == nil { + t.Errorf("scheduler: %d, expected error, got nil", tc.niceValue) + } + if !tc.isErr && err != nil { + t.Errorf("scheduler: %d, expected nil, got error %v", tc.niceValue, err) + } + } +} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index d76166aafef..b9affb91c4b 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -640,6 +640,20 @@ func setupRlimits(limits []configs.Rlimit, pid int) error { return nil } +func setupScheduler(config *configs.Config) error { + attr, err := configs.ToSchedAttr(config.Scheduler) + if err != nil { + return err + } + if err := unix.SchedSetAttr(0, attr, 0); err != nil { + if errors.Is(err, unix.EPERM) && config.Cgroups.CpusetCpus != "" { + return errors.New("process scheduler can't be used together with AllowedCPUs") + } + return fmt.Errorf("error setting scheduler: %w", err) + } + return nil +} + // signalAllProcesses freezes then iterates over all the processes inside the // manager's cgroups sending the signal s to them. func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { diff --git a/libcontainer/process.go b/libcontainer/process.go index d2c7bfcda36..08c2396fe02 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -95,6 +95,8 @@ type Process struct { // // For cgroup v2, the only key allowed is "". SubCgroupPaths map[string]string + + Scheduler *configs.Scheduler } // Wait waits for the process to exit. diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 82bcfec2aa4..f3edb7100c8 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -65,6 +65,12 @@ func (l *linuxSetnsInit) Init() error { unix.Umask(int(*l.config.Config.Umask)) } + if l.config.Config.Scheduler != nil { + if err := setupScheduler(l.config.Config); err != nil { + return err + } + } + if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil { return err } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index cc4e3d256ba..c5553832776 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -494,6 +494,10 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { Ambient: spec.Process.Capabilities.Ambient, } } + if spec.Process.Scheduler != nil { + s := *spec.Process.Scheduler + config.Scheduler = &s + } } createHooks(spec, config) config.Version = specs.Version diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 86750700c60..4fab50c0581 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -159,6 +159,13 @@ func (l *linuxStandardInit) Init() error { return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err} } } + + if l.config.Config.Scheduler != nil { + if err := setupScheduler(l.config.Config); err != nil { + return err + } + } + // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. diff --git a/tests/integration/scheduler.bats b/tests/integration/scheduler.bats new file mode 100644 index 00000000000..c07b760d6e5 --- /dev/null +++ b/tests/integration/scheduler.bats @@ -0,0 +1,34 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + requires root + setup_debian +} + +function teardown() { + teardown_bundle +} + +@test "scheduler is applied" { + update_config ' .process.scheduler = {"policy": "SCHED_DEADLINE", "nice": 19, "priority": 0, "runtime": 42000, "deadline": 1000000, "period": 1000000, }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_scheduler + [ "$status" -eq 0 ] + + runc exec test_scheduler chrt -p 1 + [ "$status" -eq 0 ] + + [[ "${lines[0]}" == *"scheduling policy: SCHED_DEADLINE" ]] + [[ "${lines[1]}" == *"priority: 0" ]] + [[ "${lines[2]}" == *"runtime/deadline/period parameters: 42000/1000000/1000000" ]] +} + +@test "scheduler vs cpus" { + update_config ' .linux.resources.cpu.cpus = "0" + | .process.scheduler = {"policy": "SCHED_DEADLINE", "nice": 19, "runtime": 42000, "deadline": 1000000, "period": 1000000, }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_scheduler + [ "$status" -eq 1 ] +} diff --git a/utils_linux.go b/utils_linux.go index 0f787cb3387..b5c855cdbb9 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -61,6 +61,11 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) { lp.ConsoleHeight = uint16(p.ConsoleSize.Height) } + if p.Scheduler != nil { + s := *p.Scheduler + lp.Scheduler = &s + } + if p.Capabilities != nil { lp.Capabilities = &configs.Capabilities{} lp.Capabilities.Bounding = p.Capabilities.Bounding