diff --git a/docs/spec-conformance.md b/docs/spec-conformance.md index 10496703ba1..f18f5dcfd71 100644 --- a/docs/spec-conformance.md +++ b/docs/spec-conformance.md @@ -17,4 +17,3 @@ v1.1.0-rc.1 | `SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV` | [#3862](https://github v1.1.0-rc.2 | time namespaces | [#3876](https://github.com/opencontainers/runc/pull/3876) v1.1.0-rc.2 | rsvd hugetlb cgroup | TODO ([#3859](https://github.com/opencontainers/runc/issues/3859)) v1.1.0-rc.3 | `.process.scheduler` | TODO ([#3895](https://github.com/opencontainers/runc/issues/3895)) -v1.1.0-rc.3 | `.process.ioPriority` | [#3783](https://github.com/opencontainers/runc/pull/3783) diff --git a/docs/terminals.md b/docs/terminals.md index aa9f71ee059..bec9a5fe2a9 100644 --- a/docs/terminals.md +++ b/docs/terminals.md @@ -58,7 +58,7 @@ you use `runc` directly in something like a `systemd` unit file. To disable this `LISTEN_FDS`-style passing just unset `LISTEN_FDS`. **Be very careful when passing file descriptors to a container process.** Due -to some Linux kernel (mis)features, a container with access to certain types of +to some Linux kernel misfeatures, a container with access to certain types of file descriptors (such as `O_PATH` descriptors) outside of the container's root file system can use these to break out of the container's pivoted mount namespace. [This has resulted in CVEs in the past.][CVE-2016-9962] diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index d43ea7860a2..7666e000075 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -212,6 +212,20 @@ type Config struct { // RootlessCgroups is set when unlikely to have the full access to cgroups. // When RootlessCgroups is set, cgroups errors are ignored. RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + + // IOPriority is the container's I/O priority. 
+ IOPriority *IOPriority `json:"io_priority,omitempty"` +} + +var IOPrioClassMapping = map[specs.IOPriorityClass]int{ + specs.IOPRIO_CLASS_RT: 1, + specs.IOPRIO_CLASS_BE: 2, + specs.IOPRIO_CLASS_IDLE: 3, +} + +type IOPriority struct { + Class specs.IOPriorityClass `json:"class"` + Priority int `json:"priority"` } type ( diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 483e7a2ff3e..df3b0ab0702 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -29,6 +29,7 @@ func Validate(config *configs.Config) error { sysctl, intelrdtCheck, rootlessEUIDCheck, + ioPriority, } for _, c := range checks { if err := c(config); err != nil { @@ -286,3 +287,14 @@ func isHostNetNS(path string) (bool, error) { return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil } + +func ioPriority(config *configs.Config) error { + if config.IOPriority == nil { + return nil + } + priority := config.IOPriority.Priority + if priority < 0 || priority > 7 { + return fmt.Errorf("invalid ioPriority.Priority: %d", priority) + } + return nil +} diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go index f59d0f2030c..4d14c04ac20 100644 --- a/libcontainer/configs/validate/validator_test.go +++ b/libcontainer/configs/validate/validator_test.go @@ -387,3 +387,32 @@ func TestValidateMounts(t *testing.T) { } } } + +func TestValidateIOPriority(t *testing.T) { + testCases := []struct { + isErr bool + priority int + }{ + {isErr: false, priority: 0}, + {isErr: false, priority: 7}, + {isErr: true, priority: -1}, + } + + for _, tc := range testCases { + ioPriority := configs.IOPriority{ + Priority: tc.priority, + } + config := &configs.Config{ + Rootfs: "/var", + IOPriority: &ioPriority, + } + + err := Validate(config) + if tc.isErr && err == nil { + t.Errorf("iopriority: %d, expected error, got nil", tc.priority) + } + if !tc.isErr && err != nil { + 
t.Errorf("iopriority: %d, expected nil, got error %v", tc.priority, err) + } + } +} diff --git a/libcontainer/process.go b/libcontainer/process.go index 8a5d340dacd..348f3ef25e3 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -89,6 +89,8 @@ type Process struct { // // For cgroup v2, the only key allowed is "". SubCgroupPaths map[string]string + + IOPriority *configs.IOPriority } // Wait waits for the process to exit. diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 48861406dba..df4f170dd0f 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -80,6 +80,12 @@ func (p *setnsProcess) signal(sig os.Signal) error { func (p *setnsProcess) start() (retErr error) { defer p.messageSockPair.parent.Close() + if p.process.IOPriority != nil { + if err := utils.SetIOPriority(p.process.IOPriority); err != nil { + return err + } + } + // get the "before" value of oom kill count oom, _ := p.manager.OOMKillCount() err := p.cmd.Start() diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 809424a97eb..27e7c957107 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -491,6 +491,12 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { Ambient: spec.Process.Capabilities.Ambient, } } + if spec.Process.IOPriority != nil { + config.IOPriority = &configs.IOPriority{ + Class: spec.Process.IOPriority.Class, + Priority: spec.Process.IOPriority.Priority, + } + } } createHooks(spec, config) config.Version = specs.Version diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index a4c01953aeb..58ce92e9f9b 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -17,6 +17,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + 
"github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -159,6 +160,13 @@ func (l *linuxStandardInit) Init() error { return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err} } } + + if l.config.Config.IOPriority != nil { + if err := utils.SetIOPriority(l.config.Config.IOPriority); err != nil { + return err + } + } + // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 220d0b43937..c1ca3dc621c 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -7,6 +7,9 @@ import ( "fmt" "os" "strconv" + "syscall" + + "github.com/opencontainers/runc/libcontainer/configs" "golang.org/x/sys/unix" ) @@ -67,3 +70,22 @@ func NewSockPair(name string) (parent *os.File, child *os.File, err error) { } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +const ( + IoprioWhoPgrp = 1 +) + +func SetIOPriority(ioprio *configs.IOPriority) error { + class, ok := configs.IOPrioClassMapping[ioprio.Class] + if !ok { + return fmt.Errorf("invalid io priority class: %s", ioprio.Class) + } + + // Combine class and priority into a single value + iop := (class << 13) | ioprio.Priority + _, _, errno := syscall.RawSyscall(syscall.SYS_IOPRIO_SET, IoprioWhoPgrp, 0, uintptr(iop)) + if errno != 0 { + return fmt.Errorf("failed to set io priority: %w", errno) + } + return nil +} diff --git a/tests/integration/ioprio.bats b/tests/integration/ioprio.bats new file mode 100644 index 00000000000..a907d782f01 --- /dev/null +++ b/tests/integration/ioprio.bats @@ -0,0 +1,30 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_debian +} + +function teardown() { + teardown_bundle +} + +@test "ioprio_set is applied to process group" { + # Create a container with a specific I/O 
priority. + update_config '.process.ioPriority = {"class": "IOPRIO_CLASS_BE", "priority": 4}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_ioprio + [ "$status" -eq 0 ] + + # Check the init process. + runc exec test_ioprio ionice -p 1 + [ "$status" -eq 0 ] + [[ "$output" = *'best-effort: prio 4'* ]] + + # Check the process made from the exec command. + runc exec test_ioprio ionice + [ "$status" -eq 0 ] + + [[ "$output" = *'best-effort: prio 4'* ]] +} diff --git a/utils_linux.go b/utils_linux.go index 4c00b2092db..d42b01a4b33 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -61,6 +61,13 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) { lp.ConsoleHeight = uint16(p.ConsoleSize.Height) } + if p.IOPriority != nil { + lp.IOPriority = &configs.IOPriority{ + Class: p.IOPriority.Class, + Priority: p.IOPriority.Priority, + } + } + if p.Capabilities != nil { lp.Capabilities = &configs.Capabilities{} lp.Capabilities.Bounding = p.Capabilities.Bounding