From 454ce0969837a292841d1e1b4e8895d4b62d782c Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 17 Jan 2021 20:24:06 +1100 Subject: [PATCH 1/2] [hotfix] seccomp: default to -ENOSYS for SECCOMP_RET_ERRNO This is a hotfix to make -ENOSYS the default errno if the default action is SECCOMP_RET_ERRNO. This is necessary because glibc cannot make use of newer syscalls if we block all unknown syscalls with -EPERM (our old behaviour). Unfortunately this is not an ideal solution (syscalls with complicated rules will now return -ENOSYS rather than -EPERM) but a complete solution will require far more work -- most likely a reimplementation of libseccomp to allow us to create custom BPF filters ourselves, or large changes to libseccomp to better accommodate our requirements -- and thus this hotfix was written to solve the immediate problem while we work on a more complete solution. Signed-off-by: Aleksa Sarai --- libcontainer/seccomp/seccomp_linux.go | 61 +++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go index 2370a5be664..21283a93978 100644 --- a/libcontainer/seccomp/seccomp_linux.go +++ b/libcontainer/seccomp/seccomp_linux.go @@ -10,8 +10,9 @@ import ( "strings" "github.com/opencontainers/runc/libcontainer/configs" - libseccomp "github.com/seccomp/libseccomp-golang" + libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -29,6 +30,49 @@ const ( syscallMaxArguments int = 6 ) +// enosysHotfixFilter adds explicit -EPERM rules for any syscall present in +// Linux 3.0 (meaning its syscall number is at most that of "setns") which did not +// have a rule in the filter. This is to avoid giving -ENOSYS for basic +// syscalls. +func enosysHotfixFilter(config *configs.Seccomp) error { + // Our default actions for hotfixed syscalls. 
+ defaultAction := config.DefaultAction + defaultErrno := uint(unix.EPERM) + + // Collect all syscalls that had some rule. + seenSyscalls := map[string]bool{} + for _, rule := range config.Syscalls { + seenSyscalls[rule.Name] = true + } + + // And now we create unconditional rules for any syscalls not present in + // the allow list at all, up to the last syscall number (which is currently + // the last syscall added to Linux 3.0 -- "setns"). + lastSysNo, err := libseccomp.GetSyscallFromName("setns") + if err != nil { + // TODO: Maybe have a nicer fallback than this? + return errors.New("cannot find syscall number for 'setns'") + } + for sysNo := 0; sysNo <= int(lastSysNo); sysNo++ { + sysName, err := libseccomp.ScmpSyscall(sysNo).GetName() + if err != nil { + // No such syscall... + continue + } + if seenSyscalls[sysName] { + // Rule already exists. + continue + } + logrus.Debugf("seccomp hotfix: injecting blanket EPERM rule for %s", sysName) + config.Syscalls = append(config.Syscalls, &configs.Syscall{ + Name: sysName, + Action: defaultAction, + ErrnoRet: &defaultErrno, + }) + } + return nil +} + // Filters given syscalls in a container, preventing them from being used // Started in the container init process, and carried over to all child processes // Setns calls, however, require a separate invocation, as they are not children @@ -38,7 +82,18 @@ func InitSeccomp(config *configs.Seccomp) error { return errors.New("cannot initialize Seccomp - nil config passed") } - defaultAction, err := getAction(config.DefaultAction, nil) + // Default to an errno of ENOSYS as the default action if the default + // action is SCMP_ACT_ERRNO. This is to avoid causing glibc headaches + // when new syscalls are added. 
+ defaultErrno := uint(unix.EPERM) + if config.DefaultAction == configs.Errno { + defaultErrno = uint(unix.ENOSYS) + if err := enosysHotfixFilter(config); err != nil { + return fmt.Errorf("error hotfixing filter: %s", err) + } + } + + defaultAction, err := getAction(config.DefaultAction, &defaultErrno) if err != nil { return errors.New("error initializing seccomp - invalid default action") } @@ -54,7 +109,6 @@ func InitSeccomp(config *configs.Seccomp) error { if err != nil { return fmt.Errorf("error validating Seccomp architecture: %s", err) } - if err := filter.AddArch(scmpArch); err != nil { return fmt.Errorf("error adding architecture to seccomp filter: %s", err) } @@ -70,7 +124,6 @@ func InitSeccomp(config *configs.Seccomp) error { if call == nil { return errors.New("encountered nil syscall while initializing Seccomp") } - if err = matchCall(filter, call); err != nil { return err } From 65138145f48a480b89c639c11ce60e4cd9520071 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 15 Jan 2021 22:23:45 +1100 Subject: [PATCH 2/2] tests: add seccomp -ENOSYS integration test Signed-off-by: Aleksa Sarai --- Dockerfile | 1 + Vagrantfile.centos7 | 2 +- Vagrantfile.fedora33 | 2 +- tests/integration/config.json | 0 tests/integration/seccomp.bats | 24 + .../testdata/seccomp_syscall_test1.c | 79 +++ .../testdata/seccomp_syscall_test1.json | 464 ++++++++++++++++++ 7 files changed, 570 insertions(+), 2 deletions(-) delete mode 100644 tests/integration/config.json create mode 100644 tests/integration/seccomp.bats create mode 100644 tests/integration/testdata/seccomp_syscall_test1.c create mode 100644 tests/integration/testdata/seccomp_syscall_test1.json diff --git a/Dockerfile b/Dockerfile index 1b38390eab6..f5ce90c0efe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,6 +23,7 @@ RUN echo 'deb https://download.opensuse.org/repositories/devel:/tools:/criu/Debi crossbuild-essential-ppc64el \ curl \ gawk \ + gcc \ iptables \ jq \ kmod \ diff --git a/Vagrantfile.centos7 
b/Vagrantfile.centos7 index 29fd9b1be12..c34cfa45490 100644 --- a/Vagrantfile.centos7 +++ b/Vagrantfile.centos7 @@ -22,7 +22,7 @@ Vagrant.configure("2") do |config| # install yum packages yum install -y -q epel-release (cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo) - yum install -y -q gcc git iptables jq libseccomp-devel make skopeo criu + yum install -y -q gcc git iptables jq glibc-static libseccomp-devel make skopeo criu yum clean all # install Go diff --git a/Vagrantfile.fedora33 b/Vagrantfile.fedora33 index a32bed4a07e..dc5fc30c9c6 100644 --- a/Vagrantfile.fedora33 +++ b/Vagrantfile.fedora33 @@ -21,7 +21,7 @@ Vagrant.configure("2") do |config| config exclude kernel,kernel-core config install_weak_deps false update -install iptables gcc make golang-go libseccomp-devel bats jq git-core criu skopeo +install iptables gcc make golang-go glibc-static libseccomp-devel bats jq git-core criu skopeo ts run EOF done diff --git a/tests/integration/config.json b/tests/integration/config.json deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/integration/seccomp.bats b/tests/integration/seccomp.bats new file mode 100644 index 00000000000..267c6837ab9 --- /dev/null +++ b/tests/integration/seccomp.bats @@ -0,0 +1,24 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run [seccomp -ENOSYS handling]" { + TEST_NAME="seccomp_syscall_test1" + + # Compile the test binary and update the config to run it. 
+ gcc -static -o rootfs/seccomp_test "${TESTDATA}/${TEST_NAME}.c" + update_config ".linux.seccomp = $(<"${TESTDATA}/${TEST_NAME}.json")" + update_config '.process.args = ["/seccomp_test"]' + + runc run test_busybox + [ "$status" -eq 0 ] +} diff --git a/tests/integration/testdata/seccomp_syscall_test1.c b/tests/integration/testdata/seccomp_syscall_test1.c new file mode 100644 index 00000000000..f2bcd89c5ca --- /dev/null +++ b/tests/integration/testdata/seccomp_syscall_test1.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <unistd.h> +#include <errno.h> +#include <limits.h> +#include <fcntl.h> + +#include <sys/socket.h> +#include <sys/syscall.h> +#include <sys/types.h> + +static int exit_code = 0; + +/* + * We need raw wrappers around each syscall so that glibc won't rewrite the + * errno value when it is returned from the seccomp filter (glibc has a habit + * of hiding -ENOSYS if possible -- which counters what we're trying to test). + */ +#define raw(name, ...) \ + syscall(SYS_ ## name, ##__VA_ARGS__) + +#define syscall_assert(sval, rval) \ + do { \ + int L = (sval), R = (rval); \ + if (L < 0) \ + L = -errno; \ + if (L != R) { \ + printf("syscall_assert(%s == %s) failed: %d != %d\n", #sval, #rval, L, R); \ + exit_code = 32; \ + } \ + } while (0) + +int main(void) +{ + // Basic permitted syscalls. + syscall_assert(write(-1, NULL, 0), -EBADF); + + // Basic syscall with masked rules. + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x000), 3); + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x0FF), -EPROTONOSUPPORT); + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x001), 4); + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x100), -ENOSYS); // FIXME :( + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0xC00), -ENOSYS); // FIXME :( + + // Multiple arguments with OR rules. 
+ syscall_assert(raw(process_vm_readv, 100, NULL, 0, NULL, 0, ~0), -EINVAL); + syscall_assert(raw(process_vm_readv, 9001, NULL, 0, NULL, 0, ~0), -EINVAL); + syscall_assert(raw(process_vm_readv, 0, NULL, 0, NULL, 0, ~0), -ENOSYS); // FIXME :( + syscall_assert(raw(process_vm_readv, 0, NULL, 0, NULL, 0, ~0), -ENOSYS); // FIXME :( + + // Multiple arguments with OR rules -- rule is ERRNO(-ENOANO). + syscall_assert(raw(process_vm_writev, 1337, NULL, 0, NULL, 0, ~0), -ENOANO); + syscall_assert(raw(process_vm_writev, 2020, NULL, 0, NULL, 0, ~0), -ENOANO); + syscall_assert(raw(process_vm_writev, 0, NULL, 0, NULL, 0, ~0), -ENOSYS); // FIXME :( + syscall_assert(raw(process_vm_writev, 0, NULL, 0, NULL, 0, ~0), -ENOSYS); // FIXME :( + + // Multiple arguments with AND rules. + syscall_assert(raw(kcmp, 0, 1337, 0, 0, 0), -ESRCH); + syscall_assert(raw(kcmp, 0, 0, 0, 0, 0), -ENOSYS); // FIXME :( + syscall_assert(raw(kcmp, 500, 1337, 0, 0, 0), -ENOSYS); // FIXME :( + syscall_assert(raw(kcmp, 500, 500, 0, 0, 0), -ENOSYS); // FIXME :( + + // Multiple rules for the same syscall. + syscall_assert(raw(dup3, 0, -100, 0xFFFF), -ENOSYS); // FIXME :( + syscall_assert(raw(dup3, 1, -100, 0xFFFF), -EINVAL); + syscall_assert(raw(dup3, 2, -100, 0xFFFF), -ENOSYS); // FIXME :( + syscall_assert(raw(dup3, 3, -100, 0xFFFF), -EINVAL); + + // Explicitly denied syscalls (those in Linux 3.0) get -EPERM. + syscall_assert(raw(unshare, 0), -EPERM); + syscall_assert(raw(setns, 0, 0), -EPERM); + + // Out-of-bounds fake syscall. 
+ syscall_assert(syscall(1000, 0xDEADBEEF, 0xCAFEFEED, 0x1337), -ENOSYS); + + return exit_code; +} diff --git a/tests/integration/testdata/seccomp_syscall_test1.json b/tests/integration/testdata/seccomp_syscall_test1.json new file mode 100644 index 00000000000..c48ceae7e1d --- /dev/null +++ b/tests/integration/testdata/seccomp_syscall_test1.json @@ -0,0 +1,464 @@ +{ + "defaultAction": "SCMP_ACT_ERRNO", + "architectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32", + "SCMP_ARCH_X86_64", + "SCMP_ARCH_AARCH64", + "SCMP_ARCH_ARM" + ], + "syscalls": [ + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "arch_prctl", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "close", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + 
"getresuid32", + "getrlimit", + "get_robust_list", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "get_thread_area", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "ioctl", + "io_destroy", + "io_getevents", + "io_pgetevents", + "io_pgetevents_time64", + "ioprio_get", + "ioprio_set", + "io_setup", + "io_submit", + "io_uring_enter", + "io_uring_register", + "io_uring_setup", + "ipc", + "kill", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "_llseek", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "membarrier", + "memfd_create", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "modify_ldt", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "nanosleep", + "newfstatat", + "_newselect", + "open", + "openat", + "openat2", + "pause", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "pselect6", + "pselect6_time64", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readlink", + "readlinkat", + "readv", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_getaffinity", + 
"sched_getattr", + "sched_getparam", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "set_robust_list", + "setsid", + "setsockopt", + "set_thread_area", + "set_tid_address", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaltstack", + "signalfd", + "signalfd4", + "sigprocmask", + "sigreturn", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "sysinfo", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "vmsplice", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "dup3" + ], + "args": [ + { + "index": 0, + "value": 1, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "dup3" + ], + "args": [ + { + "index": 0, + 
"value": 2, + "op": "SCMP_CMP_GT" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "kcmp" + ], + "args": [ + { + "index": 0, + "value": 0, + "op": "SCMP_CMP_EQ" + }, + { + "index": 1, + "value": 1337, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "process_vm_readv" + ], + "args": [ + { + "index": 0, + "value": 100, + "op": "SCMP_CMP_EQ" + }, + { + "index": 0, + "value": 9001, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ERRNO", + "errnoRet": 55, + "names": [ + "process_vm_writev" + ], + "args": [ + { + "index": 0, + "value": 1337, + "op": "SCMP_CMP_EQ" + }, + { + "index": 0, + "value": 2020, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "clone" + ], + "args": [ + { + "index": 0, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "socket" + ], + "args": [ + { + "index": 2, + "value": 3840, + "valueTwo": 0, + "op": "SCMP_CMP_MASKED_EQ" + } + ] + } + ] +} +