diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..635bbbb5 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,68 @@ +--- +# see https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions +name: Static check and build project +on: + - push + - pull_request + +jobs: + build: + runs-on: ubuntu-20.04 + steps: + - name: Install Go + uses: actions/setup-go@v2 + with: + go-version: 1.16.x + + - name: Checkout code + uses: actions/checkout@v2 + + - name: Check format of go sources + run: | + go fmt ./... + git diff --exit-code + + - name: Check format of shell scripts + run: | + GO111MODULE=off go get -u mvdan.cc/sh/v3/cmd/shfmt + find . -name \*.sh | xargs shfmt -d + + - name: Lint go sources + run: | + GO111MODULE=off go get -u golang.org/x/lint/golint + golint -set_exit_status ./... + + - name: Install dependencies + run: | + sudo add-apt-repository ppa:ubuntu-lxc/daily -y + sudo apt-get install -qq lxc-dev libc6-dev pkg-config make + + - name: Build + run: | + make build + sudo -E "PATH=$PATH" make install + + - name: Run staticcheck + run: | + GO111MODULE=off go get -u honnef.co/go/tools/cmd/staticcheck + staticcheck ./... 
+ + + - name: Test unprivileged + run: | + # keep PATH to use go installed through actions/setup-go@v2 + # and not the system version (which is currently go 1.15.x) + sudo /bin/sh -c "echo '$(whoami):1000:1' >> /etc/subuid" + sudo /bin/sh -c "echo '$(whoami):20000:65536' >> /etc/subuid" + sudo /bin/sh -c "echo '$(whoami):1000:1' >> /etc/subgid" + sudo /bin/sh -c "echo '$(whoami):20000:65536' >> /etc/subgid" + sudo chown -R $(whoami):$(whoami) /sys/fs/cgroup/unified$(cat /proc/self/cgroup | grep '^0:' | cut -d: -f3) + # detect file descriptor leaks + ulimit -n 30 + TESTCOUNT=10 make test + + - name: Test privileged + run: | + # keep PATH to use go installed through actions/setup-go@v2 + # and not the system version (which is currently go 1.15.x) + sudo -E "PATH=$PATH" make test diff --git a/.gitignore b/.gitignore index f940ec22..c1df613a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ *~ -crio-lxc -crio-lxc-test* -oci/ -roots/ +/lxcri +/lxcri-start +/lxcri-init +/lxcri-hook +/lxcri-test +/lxcri-hook-builtin .stacker/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..c4ad7043 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM ubuntu:latest +ARG installcmd=install_all + +#ENV PKGS="psmisc util-linux" + +ENV GOLANG_SRC=https://golang.org/dl/go1.16.2.linux-amd64.tar.gz +ENV GOLANG_CHECKSUM=542e936b19542e62679766194364f45141fde55169db2d8d01046555ca9eb4b8 + +ENV CNI_PLUGINS_GIT_REPO=https://github.com/containernetworking/plugins.git +ENV CNI_PLUGINS_GIT_VERSION=v0.9.1 + +ENV CONMON_GIT_REPO=https://github.com/containers/conmon.git +ENV CONMON_GIT_VERSION=v2.0.27 + +ENV CRIO_GIT_REPO=https://github.com/cri-o/cri-o.git +ENV CRIO_GIT_VERSION=v1.20.1 + +ENV CRICTL_CHECKSUM=44d5f550ef3f41f9b53155906e0229ffdbee4b19452b4df540265e29572b899c +ENV CRICTL_URL="https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.20.0/crictl-v1.20.0-linux-amd64.tar.gz" + +# see 
https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.20.md +ENV K8S_CHECKSUM=37738bc8430b0832f32c6d13cdd68c376417270568cd9b31a1ff37e96cfebcc1e2970c72bed588f626e35ed8273671c77200f0d164e67809b5626a2a99e3c5f5 +ENV K8S_URL="https://dl.k8s.io/v1.20.4/kubernetes-server-linux-amd64.tar.gz" + +## development +ENV LXC_GIT_REPO=https://github.com/lxc/lxc.git +ENV LXC_GIT_VERSION=master + +ENV LXCRI_GIT_REPO=https://github.com/drachenfels-de/lxcri.git +ENV LXCRI_GIT_VERSION=main + +COPY install.sh / +RUN /install.sh ${installcmd} diff --git a/Makefile b/Makefile index 1d8f2b27..5f3a0723 100644 --- a/Makefile +++ b/Makefile @@ -1,26 +1,67 @@ -GO_SRC=$(shell find . -name \*.go) -COMMIT_HASH=$(shell git rev-parse HEAD) -COMMIT=$(if $(shell git status --porcelain --untracked-files=no),$(COMMIT_HASH)-dirty,$(COMMIT_HASH)) -TEST?=$(patsubst test/%.bats,%,$(wildcard test/*.bats)) -PACKAGES_DIR?=~/packages +COMMIT_HASH = $(shell git describe --always --tags --long) +COMMIT = $(if $(shell git status --porcelain --untracked-files=no),$(COMMIT_HASH)-dirty,$(COMMIT_HASH)) +BINS := lxcri +LIBEXEC_BINS := lxcri-start lxcri-init lxcri-hook lxcri-hook-builtin +# Installation prefix for BINS +PREFIX ?= /usr/local +export PREFIX +LIBEXEC_DIR = $(PREFIX)/libexec/lxcri +export LIBEXEC_DIR +PKG_CONFIG_PATH ?= $(PREFIX)/lib/pkgconfig +# Note: The default pkg-config directory is searched after PKG_CONFIG_PATH +# Note: (Exported) environment variables are NOT visible in the environment of the $(shell ...) function. +export PKG_CONFIG_PATH +LDFLAGS=-X main.version=$(COMMIT) -X main.libexecDir=$(LIBEXEC_DIR) +CC ?= cc +SHELL_SCRIPTS = $(shell find . -name \*.sh) +GO_SRC = $(shell find . -name \*.go | grep -v _test.go) +TESTCOUNT ?= 1 -lint: - golangci-lint run -c ./lint.yaml ./... 
+all: fmt test -crio-lxc: $(GO_SRC) - go build -ldflags "-X main.version=$(COMMIT)" -o crio-lxc ./cmd +update-tools: + GO111MODULE=off go get -u mvdan.cc/sh/v3/cmd/shfmt + GO111MODULE=off go get -u golang.org/x/lint/golint + GO111MODULE=off go get -u honnef.co/go/tools/cmd/staticcheck -# make test TEST=basic will run only the basic test. -.PHONY: check -check: crio-lxc - go fmt ./... && ([ -z $(TRAVIS) ] || git diff --quiet) - go test ./... - PACKAGES_DIR=$(PACKAGES_DIR) sudo -E "PATH=$$PATH" bats -t $(patsubst %,test/%.bats,$(TEST)) +fmt: + go fmt ./... + shfmt -w $(SHELL_SCRIPTS) + golint ./... + go mod tidy + staticcheck ./... -.PHONY: vendorup -vendorup: - go get -u +.PHONY: test +test: build + go build ./pkg/internal/lxcri-test + go test --failfast --count $(TESTCOUNT) -v ./... + +build: $(BINS) $(LIBEXEC_BINS) + +lxcri: go.mod $(GO_SRC) Makefile + go build -ldflags '$(LDFLAGS)' -o $@ ./cmd/lxcri + +lxcri-start: cmd/lxcri-start/lxcri-start.c + $(CC) -Werror -Wpedantic -o $@ $? $$(pkg-config --libs --cflags lxc) + +lxcri-init: go.mod $(GO_SRC) Makefile + CGO_ENABLED=0 go build -o $@ ./cmd/lxcri-init + # this is paranoia - but ensure it is statically compiled + ! ldd $@ 2>/dev/null + +lxcri-hook: go.mod $(GO_SRC) Makefile + go build -o $@ ./cmd/$@ + +lxcri-hook-builtin: go.mod $(GO_SRC) Makefile + go build -o $@ ./cmd/$@ + +install: build + mkdir -p $(PREFIX)/bin + cp -v $(BINS) $(PREFIX)/bin + mkdir -p $(LIBEXEC_DIR) + cp -v $(LIBEXEC_BINS) $(LIBEXEC_DIR) .PHONY: clean clean: - -rm -f crio-lxc + -rm -f $(BINS) $(LIBEXEC_BINS) + diff --git a/README.md b/README.md index fbce9fd6..fa972b86 100644 --- a/README.md +++ b/README.md @@ -1,53 +1,28 @@ -# crio-lxc +# About -This is a wrapper around [LXC](https://github.com/lxc/lxc) which can be used as -a drop-in container runtime replacement for use by -[CRI-O](https://github.com/kubernetes-sigs/cri-o). 
+`lxcri` is a wrapper around [LXC](https://github.com/lxc/lxc) which can be used as +a drop-in container runtime replacement for use by [CRI-O](https://github.com/kubernetes-sigs/cri-o). -To use this, simply build it: +### OCI compliance -``` -make -``` +With liblxc >= https://github.com/lxc/lxc/commit/b5daeddc5afce1cad4915aef3e71fdfe0f428709 +it passes all sonobuoy conformance tests. -Then specify the `crio-lxc` binary you just built as the value for -`default_runtime` in the `crio.runtime` section of `/etc/crio/crio.conf`. +## Installation -## Notes +For the installation of the runtime see [install.md](doc/install.md)
+For the installation and initialization of a kubernetes cluster see [kubernetes.md](doc/kubernetes.md) -Note that you must have a new enough liblxc, one which supports the -"lxc.rootfs.managed" key. 3.0.3 is not new enough, 3.1 is. On Ubuntu, -you can upgrade using the ubuntu-lxc/lxc-git-master PPA. Arch and -OpenSUSE tumbleweed should be uptodate. +## Bugs -## Tests +* cli: --help shows environment values not defaults https://github.com/urfave/cli/issues/1206 -To run the 'basic' test, you'll need to build cri-o and CNI. +## Requirements and restrictions -``` -mkdir ~/packages -cd packages -git clone https://github.com/kubernetes-sigs/cri-o -cd cri-o -make -cd .. -git clone https://github.com/containernetworking/cni -git clone https://github.com/containernetworking/plugins cni-plugins -cd cni-plugins -./build_linux.sh -``` +* Only cgroupv2 (unified cgroup hierarchy) is supported. +* A recent kernel >= 5.8 is required for full cgroup support. -You'll also need crictl. Download the tarball, extract it, and -copy crictl to somewhere in your path: +### Unimplemented features -``` -wget https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.14.0/crictl-v1.14.0-linux-amd64.tar.gz -tar zxf crictl-v1.14.0-linux-amd64.tar.gz -sudo cp crictl /usr/local/bin # or ~/.local/bin, etc. 
-``` - -You'll also need conntrack installed: - -``` -apt install conntrack -``` +* [runtime: Implement POSIX platform hooks](https://github.com/Drachenfels-GmbH/lxcri/issues/10) +* [runtime: Implement cgroup2 resource limits](https://github.com/Drachenfels-GmbH/lxcri/issues/11) diff --git a/cgroup.go b/cgroup.go new file mode 100644 index 00000000..c9e5e6d0 --- /dev/null +++ b/cgroup.go @@ -0,0 +1,453 @@ +package lxcri + +import ( + "context" + "fmt" + "io/fs" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + //"github.com/fsnotify/fsnotify" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +var cgroupRoot = "/sys/fs/cgroup" + +func detectCgroupRoot() (string, error) { + var cgroupRoot string + if err := isFilesystem("/sys/fs/cgroup", "cgroup2"); err == nil { + cgroupRoot = "/sys/fs/cgroup" + } + if err := isFilesystem("/sys/fs/cgroup/unified", "cgroup2"); err == nil { + cgroupRoot = "/sys/fs/cgroup/unified" + } + + // TODO use /proc/self/mounts to detect cgroupv2 root ! + + if os.Getuid() == 0 { + if cgroupRoot == "" { + return "", fmt.Errorf("failed to detect cgroupv2 root") + } + return cgroupRoot, nil + } + + // Use the cgroup path of the runtime user if unprivileged. + data, err := os.ReadFile("/proc/self/cgroup") + if err != nil { + return cgroupRoot, fmt.Errorf("failed to load /proc/self/cgroup: %s", err) + } + lines := strings.Split(string(data), "\n") + // get cgroup path from '0::/user.slice/user-0.slice/session-52.scope' + for _, line := range lines { + vals := strings.SplitN(line, ":", 3) + if len(vals) == 3 && vals[0] == "0" { + return filepath.Join(cgroupRoot, vals[2]), nil + } + } + return cgroupRoot, fmt.Errorf("failed to parse cgroup from /proc/self/cgroup") +} + +// checkCgroup checks if the cgroup of the container is non-empty. 
+func checkCgroup(c *Container) error { + ev, err := parseCgroupEvents(filepath.Join(cgroupRoot, c.CgroupDir, "cgroup.events")) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to parse cgroup events: %w", err) + } + if err == nil && ev.populated { + return fmt.Errorf("container cgroup %s is not empty", c.CgroupDir) + } + return nil +} + +// https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config-linux.md +// TODO New spec will contain a property Unified for cgroupv2 properties +// https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#unified +func configureCgroup(rt *Runtime, c *Container) error { + if err := configureCgroupPath(rt, c); err != nil { + return err + } + + if err := checkCgroup(c); err != nil { + return err + } + + if devices := c.Spec.Linux.Resources.Devices; devices != nil { + if rt.Features.CgroupDevices { + if err := configureDeviceController(c); err != nil { + return err + } + } else { + c.Log.Warn().Msg("cgroup device controller feature is disabled - access to all devices is granted") + } + + } + + if mem := c.Spec.Linux.Resources.Memory; mem != nil { + c.Log.Debug().Msg("TODO cgroup memory controller not implemented") + } + + if cpu := c.Spec.Linux.Resources.CPU; cpu != nil { + if err := configureCPUController(rt, cpu); err != nil { + return err + } + } + + if pids := c.Spec.Linux.Resources.Pids; pids != nil { + if err := c.SetConfigItem("lxc.cgroup2.pids.max", fmt.Sprintf("%d", pids.Limit)); err != nil { + return err + } + } + if blockio := c.Spec.Linux.Resources.BlockIO; blockio != nil { + c.Log.Debug().Msg("TODO cgroup blockio controller not implemented") + } + + if hugetlb := c.Spec.Linux.Resources.HugepageLimits; hugetlb != nil { + // set Hugetlb limit (in bytes) + c.Log.Debug().Msg("TODO cgroup hugetlb controller not implemented") + } + if net := c.Spec.Linux.Resources.Network; net != nil { + c.Log.Debug().Msg("TODO cgroup network controller not implemented") + } + return nil +} + +func 
configureCgroupPath(rt *Runtime, c *Container) error { + if rt.SystemdCgroup { + c.CgroupDir = parseSystemdCgroupPath(c.Spec.Linux.CgroupsPath) + } else { + c.CgroupDir = c.Spec.Linux.CgroupsPath + } + + if err := c.SetConfigItem("lxc.cgroup.relative", "0"); err != nil { + return err + } + + // @since lxc @a900cbaf257c6a7ee9aa73b09c6d3397581d38fb + // checking for one of the config items should be enough, because they were introduced together ... + // lxc.cgroup.dir.container and lxc.cgroup.dir.monitor + splitCgroup := c.SupportsConfigItem("lxc.cgroup.dir.container", "lxc.cgroup.dir.monitor") + + if !splitCgroup || rt.MonitorCgroup == "" { + return c.SetConfigItem("lxc.cgroup.dir", c.CgroupDir) + } + + c.MonitorCgroupDir = filepath.Join(rt.MonitorCgroup, c.ContainerID+".scope") + + if err := c.SetConfigItem("lxc.cgroup.dir.container", c.CgroupDir); err != nil { + return err + } + if err := c.SetConfigItem("lxc.cgroup.dir.monitor", c.MonitorCgroupDir); err != nil { + return err + } + + if c.SupportsConfigItem("lxc.cgroup.dir.monitor.pivot") { + if err := c.SetConfigItem("lxc.cgroup.dir.monitor.pivot", rt.MonitorCgroup); err != nil { + return err + } + } + return nil + +} + +func configureDeviceController(c *Container) error { + devicesAllow := "lxc.cgroup2.devices.allow" + devicesDeny := "lxc.cgroup2.devices.deny" + + // Set cgroup device permissions from spec. + // Device rule parsing in LXC is not well documented in lxc.container.conf + // see https://github.com/lxc/lxc/blob/79c66a2af36ee8e967c5260428f8cdb5c82efa94/src/lxc/cgroups/cgfsng.c#L2545 + // Mixing allow/deny is not permitted by lxc.cgroup2.devices. + // Best practice is to build up an allow list to restrict access to new/unhandled devices. 
+ + anyDevice := "" + blockDevice := "b" + charDevice := "c" + + for _, dev := range c.Spec.Linux.Resources.Devices { + key := devicesDeny + if dev.Allow { + key = devicesAllow + } + + maj := "*" + if dev.Major != nil { + maj = fmt.Sprintf("%d", *dev.Major) + } + + min := "*" + if dev.Minor != nil { + min = fmt.Sprintf("%d", *dev.Minor) + } + + switch dev.Type { + case anyDevice: + // do not deny any device, this will also deny access to default devices + if !dev.Allow { + continue + } + // decompose + val := fmt.Sprintf("%s %s:%s %s", blockDevice, maj, min, dev.Access) + if err := c.SetConfigItem(key, val); err != nil { + return err + } + val = fmt.Sprintf("%s %s:%s %s", charDevice, maj, min, dev.Access) + if err := c.SetConfigItem(key, val); err != nil { + return err + } + case blockDevice, charDevice: + val := fmt.Sprintf("%s %s:%s %s", dev.Type, maj, min, dev.Access) + if err := c.SetConfigItem(key, val); err != nil { + return err + } + default: + return fmt.Errorf("invalid cgroup2 device - invalid type (allow:%t %s %s:%s %s)", dev.Allow, dev.Type, maj, min, dev.Access) + } + } + return nil +} + +func configureCPUController(clxc *Runtime, slinux *specs.LinuxCPU) error { + // CPU resource restriction configuration + // use strconv.FormatUint(n, 10) instead of fmt.Sprintf ? 
+ clxc.Log.Debug().Msg("TODO configure cgroup cpu controller") + /* + if cpu.Shares != nil && *cpu.Shares > 0 { + if err := clxc.SetConfigItem("lxc.cgroup2.cpu.shares", fmt.Sprintf("%d", *cpu.Shares)); err != nil { + return err + } + } + if cpu.Quota != nil && *cpu.Quota > 0 { + if err := clxc.SetConfigItem("lxc.cgroup2.cpu.cfs_quota_us", fmt.Sprintf("%d", *cpu.Quota)); err != nil { + return err + } + } + if cpu.Period != nil && *cpu.Period != 0 { + if err := clxc.SetConfigItem("lxc.cgroup2.cpu.cfs_period_us", fmt.Sprintf("%d", *cpu.Period)); err != nil { + return err + } + } + if cpu.Cpus != "" { + if err := clxc.SetConfigItem("lxc.cgroup2.cpuset.cpus", cpu.Cpus); err != nil { + return err + } + } + if cpu.RealtimePeriod != nil && *cpu.RealtimePeriod > 0 { + if err := clxc.SetConfigItem("lxc.cgroup2.cpu.rt_period_us", fmt.Sprintf("%d", *cpu.RealtimePeriod)); err != nil { + return err + } + } + if cpu.RealtimeRuntime != nil && *cpu.RealtimeRuntime > 0 { + if err := clxc.SetConfigItem("lxc.cgroup2.cpu.rt_runtime_us", fmt.Sprintf("%d", *cpu.RealtimeRuntime)); err != nil { + return err + } + } + */ + // Mems string `json:"mems,omitempty"` + return nil +} + +// https://kubernetes.io/docs/setup/production-environment/container-runtimes/ +// kubelet --cgroup-driver systemd --cgroups-per-qos +// kubernetes creates the cgroup hierarchy which can be changed by several cgroup related flags. +// kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod87f8bc68_7c18_4a1d_af9f_54eff815f688.slice +// kubepods-burstable-pod9da3b2a14682e1fb23be3c2492753207.slice:crio:fe018d944f87b227b3b7f86226962639020e99eac8991463bf7126ef8e929589 +// https://github.com/cri-o/cri-o/issues/2632 +// TODO Where is the systemd cgroup path encoding officially documented? 
+func parseSystemdCgroupPath(s string) string { + parts := strings.Split(s, ":") + + var cgPath []string + + for i, r := range parts[0] { + if r == '-' && i > 0 { + cgPath = append(cgPath, parts[0][0:i]+".slice") + } + } + cgPath = append(cgPath, parts[0]) + if len(parts) > 1 { + cgPath = append(cgPath, strings.Join(parts[1:], "-")+".scope") + } + return filepath.Join(cgPath...) +} + +// killCgroup freezes the cgroups of the given container +// and sends the given signal sig to all cgroup members. +func killCgroup(ctx context.Context, c *Container, sig unix.Signal) error { + if c.CgroupDir == "" { + return nil + } + rootDir := filepath.Join(cgroupRoot, c.CgroupDir) + eventsFile := filepath.Join(rootDir, "cgroup.events") + + ev, err := parseCgroupEvents(eventsFile) + if err != nil { + return err + } + if !ev.populated { + return nil + } + + freezer := filepath.Join(rootDir, "cgroup.freeze") + + err = cgroupFreeze(freezer, true) + if err != nil { + return err + } + + err = pollCgroupEvents(ctx, eventsFile, func(ev cgroupEvents) bool { + return ev.frozen + }) + if err != nil { + return err + } + + err = filepath.Walk(rootDir, func(path string, info fs.FileInfo, err error) error { + if err != nil { + return err + } + if info.Name() != "cgroup.procs" { + return nil + } + procsData, err := os.ReadFile(path) + if err != nil { + return err + } + // cgroup.procs contains one PID per line and is newline separated. + // A trailing newline is always present. 
+ s := strings.TrimSpace(string(procsData)) + if s == "" { + return nil + } + vals := strings.Split(s, "\n") + + c.Log.Debug().Msgf("killing %d cgroup procs: %s", len(vals), vals) + for _, s := range vals { + pid, err := strconv.Atoi(s) + if err != nil { + c.Log.Error().Msgf("failed to convert PID %q to number: %s", s, err) + continue + } + // do not kill the monitor process + if pid == c.Pid { + continue + } + err = unix.Kill(pid, sig) + if err != nil && err != unix.ESRCH { + c.Log.Error().Msgf("failed to kill %d: %s", pid, err) + continue + } + } + return nil + }) + + if err != nil { + return err + } + + err = cgroupFreeze(freezer, false) + if err != nil { + return err + } + + return nil +} + +type cgroupEvents struct { + frozen bool + populated bool +} + +func parseCgroupEvents(filename string) (cgroupEvents, error) { + ev := cgroupEvents{} + data, err := os.ReadFile(filename) + if err != nil { + return ev, err + } + lines := strings.Split(string(data), "\n") + for _, line := range lines { + switch line { + case "populated 0": + ev.populated = false + case "populated 1": + ev.populated = true + case "frozen 0": + ev.frozen = false + case "frozen 1": + ev.frozen = true + } + } + return ev, nil +} + +func cgroupFreeze(filename string, freeze bool) error { + f, err := os.OpenFile(filename, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + if freeze { + _, err = f.Write([]byte("1")) + } else { + _, err = f.Write([]byte("0")) + } + return err +} + +func pollCgroupEvents(ctx context.Context, eventsFile string, fn func(ev cgroupEvents) bool) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + ev, err := parseCgroupEvents(eventsFile) + if err != nil { + return err + } + if fn(ev) { + return nil + } + time.Sleep(time.Millisecond * 5) + } + } +} + +func deleteCgroup(cgroupName string) error { + return deleteCgroupRecursive(cgroupName, 0, 10) +} + +func deleteCgroupRecursive(cgroupName string, level, max int) error { + if 
level == max { + return fmt.Errorf("reached max recursion of %d", max) + } + dirName := filepath.Join(cgroupRoot, cgroupName) + dir, err := os.Open(dirName) + if err != nil { + return err + } + entries, err := dir.Readdir(-1) + if err := dir.Close(); err != nil { + return err + } + if err != nil { + return err + } + for _, i := range entries { + if !i.IsDir() { + continue + } + name := i.Name() + if name == "." || name == ".." { + continue + } + childGroup := filepath.Join(cgroupName, name) + err := deleteCgroupRecursive(childGroup, level+1, max) + if err != nil { + return err + } + } + return unix.Rmdir(dirName) +} diff --git a/cgroup_test.go b/cgroup_test.go new file mode 100644 index 00000000..9b1b7d45 --- /dev/null +++ b/cgroup_test.go @@ -0,0 +1,13 @@ +package lxcri + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseSystemCgroupPath(t *testing.T) { + s := "kubepods-burstable-123.slice:crio:ABC" + cg := parseSystemdCgroupPath(s) + require.Equal(t, "kubepods.slice/kubepods-burstable.slice/kubepods-burstable-123.slice/crio-ABC.scope", cg) +} diff --git a/cmd/create.go b/cmd/create.go deleted file mode 100644 index f0f20a51..00000000 --- a/cmd/create.go +++ /dev/null @@ -1,327 +0,0 @@ -package main - -import ( - "fmt" - "golang.org/x/sys/unix" - - "io/ioutil" - "os" - "os/exec" - "path" - "path/filepath" - "regexp" - "strings" - "time" - - "github.com/apex/log" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" - "github.com/urfave/cli" - - lxc "gopkg.in/lxc/go-lxc.v2" -) - -var createCmd = cli.Command{ - Name: "create", - Usage: "create a container from a bundle directory", - ArgsUsage: "", - Action: doCreate, - Flags: []cli.Flag{ - cli.StringFlag{ - Name: "bundle", - Usage: "set bundle directory", - Value: ".", - }, - cli.IntFlag{ - Name: "console-socket", - Usage: "pty master FD", // TODO not handled yet - }, - cli.StringFlag{ - Name: "pid-file", - Usage: "path to write container PID", // TODO not 
handled yet - }, - }, -} - -// maps from CRIO namespace names to LXC names -var NamespaceMap = map[string]string{ - "cgroup": "cgroup", - "ipc": "ipc", - "mount": "mnt", - "network": "net", - "pid": "pid", - "user": "user", - "uts": "uts", -} - -func ensureShell(rootfs string) error { - shPath := filepath.Join(rootfs, "bin/sh") - if exists, _ := pathExists(shPath); exists { - return nil - } - var err error - err = RunCommand("mkdir", filepath.Join(rootfs, "bin")) - if err != nil { - return errors.Wrapf(err, "Failed doing mkdir") - } - err = RunCommand("cp", "/bin/busybox", filepath.Join(rootfs, "bin/")) - if err != nil { - return errors.Wrapf(err, "Failed copying busybox") - } - err = RunCommand("ln", filepath.Join(rootfs, "bin/busybox"), filepath.Join(rootfs, "bin/stat")) - if err != nil { - return errors.Wrapf(err, "Failed linking stat") - } - err = RunCommand("ln", filepath.Join(rootfs, "bin/busybox"), filepath.Join(rootfs, "bin/sh")) - if err != nil { - return errors.Wrapf(err, "Failed linking sh") - } - err = RunCommand("ln", filepath.Join(rootfs, "bin/busybox"), filepath.Join(rootfs, "bin/tee")) - if err != nil { - return errors.Wrapf(err, "Failed linking tee") - } - return nil -} - -const ( - SYNC_FIFO_PATH = "/syncfifo" - SYNC_FIFO_CONTENT = "meshuggah rocks" -) - -func emitFifoWaiter(file string) error { - fifoWaiter := fmt.Sprintf(`#!/bin/sh -stat /syncfifo -echo "%s" | tee /syncfifo -exec $@ -`, SYNC_FIFO_CONTENT) - - return ioutil.WriteFile(file, []byte(fifoWaiter), 0755) -} - -func configureNamespaces(c *lxc.Container, spec *specs.Spec) error { - procPidPathRE := regexp.MustCompile(`/proc/(\d+)/ns`) - - var nsToClone []string - var configVal string - seenNamespaceTypes := map[specs.LinuxNamespaceType]bool{} - for _, ns := range spec.Linux.Namespaces { - if _, ok := seenNamespaceTypes[ns.Type]; ok { - return fmt.Errorf("duplicate namespace type %s", ns.Type) - } - seenNamespaceTypes[ns.Type] = true - if ns.Path == "" { - nsToClone = append(nsToClone, 
NamespaceMap[string(ns.Type)]) - } else { - configKey := fmt.Sprintf("lxc.namespace.share.%s", NamespaceMap[string(ns.Type)]) - - matches := procPidPathRE.FindStringSubmatch(ns.Path) - switch len(matches) { - case 0: - configVal = ns.Path - case 1: - return fmt.Errorf("error parsing namespace path. expected /proc/(\\d+)/ns/*, got '%s'", ns.Path) - case 2: - configVal = matches[1] - default: - return fmt.Errorf("error parsing namespace path. expected /proc/(\\d+)/ns/*, got '%s'", ns.Path) - } - - if err := c.SetConfigItem(configKey, configVal); err != nil { - return errors.Wrapf(err, "failed to set namespace config: '%s'='%s'", configKey, configVal) - } - } - } - - if len(nsToClone) > 0 { - configVal = strings.Join(nsToClone, " ") - if err := c.SetConfigItem("lxc.namespace.clone", configVal); err != nil { - return errors.Wrapf(err, "failed to set lxc.namespace.clone=%s", configVal) - } - } - return nil -} - -func doCreate(ctx *cli.Context) error { - pidfile := ctx.String("pid-file") - containerID := ctx.Args().Get(0) - if len(containerID) == 0 { - fmt.Fprintf(os.Stderr, "missing container ID\n") - cli.ShowCommandHelpAndExit(ctx, "create", 1) - } - log.Infof("creating container %s", containerID) - - exists, err := containerExists(containerID) - if err != nil { - return errors.Wrap(err, "failed to check if container exists") - } - if exists { - return fmt.Errorf("container '%s' already exists", containerID) - } - - c, err := lxc.NewContainer(containerID, LXC_PATH) - if err != nil { - return errors.Wrap(err, "failed to create new container") - } - defer c.Release() - - spec, err := readBundleSpec(filepath.Join(ctx.String("bundle"), "config.json")) - if err != nil { - return errors.Wrap(err, "couldn't load bundle spec") - } - - if err := os.MkdirAll(filepath.Join(LXC_PATH, containerID), 0770); err != nil { - return errors.Wrap(err, "failed to create container dir") - } - - if err := makeSyncFifo(filepath.Join(LXC_PATH, containerID)); err != nil { - return 
errors.Wrap(err, "failed to make sync fifo") - } - - if err := configureContainer(ctx, c, spec); err != nil { - return errors.Wrap(err, "failed to configure container") - } - - log.Infof("created syncfifo, executing %#v", spec.Process.Args) - - if err := startContainer(c, spec); err != nil { - return errors.Wrap(err, "failed to start the container init") - } - - if pidfile != "" { - err := os.MkdirAll(path.Dir(pidfile), 0755) - if err != nil { - return errors.Wrapf(err, "Couldn't create pid file directory for %s", pidfile) - } - err = ioutil.WriteFile(pidfile, []byte(fmt.Sprintf("%d", c.InitPid())), 0755) - if err != nil { - return errors.Wrapf(err, "Couldn't create pid file %s", pidfile) - } - } - - log.Infof("created container %s in lxcdir %s", containerID, LXC_PATH) - return nil -} - -func configureContainer(ctx *cli.Context, c *lxc.Container, spec *specs.Spec) error { - if ctx.Bool("debug") { - c.SetVerbosity(lxc.Verbose) - } - - if err := configureLogging(ctx, c); err != nil { - return errors.Wrap(err, "failed to configure logging") - } - - // rootfs - // todo Root.Readonly? 
- use lxc.rootfs.options - if err := c.SetConfigItem("lxc.rootfs.path", spec.Root.Path); err != nil { - return errors.Wrapf(err, "failed to set rootfs: '%s'", spec.Root.Path) - } - if err := c.SetConfigItem("lxc.rootfs.managed", "0"); err != nil { - return errors.Wrap(err, "failed to set rootfs.managed to 0") - } - - for _, envVar := range spec.Process.Env { - if err := c.SetConfigItem("lxc.environment", envVar); err != nil { - return fmt.Errorf("error setting environment variable '%s': %v", envVar, err) - } - } - - for _, ms := range spec.Mounts { - opts := strings.Join(ms.Options, ",") - mnt := fmt.Sprintf("%s %s %s %s", ms.Source, ms.Destination, ms.Type, opts) - if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { - return errors.Wrap(err, "failed to set mount config") - } - } - - mnt := fmt.Sprintf("%s %s none ro,bind,create=file", path.Join(LXC_PATH, c.Name(), SYNC_FIFO_PATH), strings.Trim(SYNC_FIFO_PATH, "/")) - if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { - return errors.Wrap(err, "failed to set syncfifo mount config entry") - } - - err := emitFifoWaiter(path.Join(spec.Root.Path, "fifo-wait")) - if err != nil { - return errors.Wrapf(err, "couldn't write wrapper init") - } - - if err := ensureShell(spec.Root.Path); err != nil { - return errors.Wrap(err, "couldn't ensure a shell exists in container") - } - - if err := c.SetConfigItem("lxc.init.cwd", spec.Process.Cwd); err != nil { - return errors.Wrap(err, "failed to set CWD") - } - - if err := c.SetConfigItem("lxc.uts.name", spec.Hostname); err != nil { - return errors.Wrap(err, "failed to set hostname") - } - - argsString := "/fifo-wait " + strings.Join(spec.Process.Args, " ") - if err := c.SetConfigItem("lxc.execute.cmd", argsString); err != nil { - return errors.Wrap(err, "failed to set lxc.execute.cmd") - - } - if err := c.SetConfigItem("lxc.hook.version", "1"); err != nil { - return errors.Wrap(err, "failed to set hook version") - } - - if err := configureNamespaces(c, 
spec); err != nil { - return errors.Wrap(err, "failed to configure namespaces") - } - - // capabilities? - - // if !spec.Process.Terminal { - // passFdsToContainer() - // } - - // Write out final config file for debugging and use with lxc-attach: - // Do not edit config after this. - savedConfigFile := filepath.Join(LXC_PATH, c.Name(), "config") - if err := c.SaveConfigFile(savedConfigFile); err != nil { - return errors.Wrapf(err, "failed to save config file to '%s'", savedConfigFile) - } - - return nil -} - -func makeSyncFifo(dir string) error { - fifoFilename := filepath.Join(dir, "syncfifo") - prevMask := unix.Umask(0000) - defer unix.Umask(prevMask) - if err := unix.Mkfifo(fifoFilename, 0622); err != nil { - return errors.Wrapf(err, "failed to make fifo '%s'", fifoFilename) - } - return nil -} - -func startContainer(c *lxc.Container, spec *specs.Spec) error { - binary, err := os.Readlink("/proc/self/exe") - if err != nil { - return err - } - - cmd := exec.Command( - binary, - "internal", - c.Name(), - LXC_PATH, - filepath.Join(LXC_PATH, c.Name(), "config"), - ) - - if !spec.Process.Terminal { - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - } - - cmdErr := cmd.Start() - - if cmdErr == nil { - if !c.Wait(lxc.RUNNING, 30*time.Second) { - cmdErr = fmt.Errorf("Container failed to initialize") - } - } - - return cmdErr -} diff --git a/cmd/defs.go b/cmd/defs.go deleted file mode 100644 index 81a1d731..00000000 --- a/cmd/defs.go +++ /dev/null @@ -1,7 +0,0 @@ -package main - -var ( - CURRENT_OCI_VERSION = "0.2.1" - // controlled by --lxc-path arg to main - LXC_PATH = "" -) diff --git a/cmd/delete.go b/cmd/delete.go deleted file mode 100644 index 02b6a69e..00000000 --- a/cmd/delete.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( - "fmt" - "os" - "path/filepath" - - "github.com/apex/log" - "github.com/pkg/errors" - "github.com/urfave/cli" - - lxc "gopkg.in/lxc/go-lxc.v2" -) - -var deleteCmd = cli.Command{ - Name: "delete", - Usage: 
"deletes a container", - Action: doDelete, - ArgsUsage: `[containerID] - - is the ID of the container to delete -`, - Flags: []cli.Flag{ - cli.BoolFlag{ - Name: "force", - Usage: "force deletion", - }, - }, -} - -func doDelete(ctx *cli.Context) error { - containerID := ctx.Args().Get(0) - if len(containerID) == 0 { - fmt.Fprintf(os.Stderr, "missing container ID\n") - cli.ShowCommandHelpAndExit(ctx, "state", 1) - } - - exists, err := containerExists(containerID) - if err != nil { - return errors.Wrap(err, "failed to check if container exists") - } - if !exists { - return fmt.Errorf("container '%s' not found", containerID) - } - - c, err := lxc.NewContainer(containerID, LXC_PATH) - if err != nil { - return errors.Wrap(err, "failed to load container") - } - defer c.Release() - - if err := configureLogging(ctx, c); err != nil { - return errors.Wrap(err, "failed to configure logging") - - } - - force := ctx.Bool("force") - if c.Running() { - if checkHackyPreStart(c) == "started" && !force { - return fmt.Errorf("container '%s' is running, cannot delete.", containerID) - } - if err := c.Stop(); err != nil { - log.Warnf("Failed to stop pre-started container %s: %v", containerID, err) - } - } - - // TODO: lxc-destroy deletes the rootfs. - // this appears to contradict the runtime spec: - - // "Note that resources associated with the container, - // but not created by this container, MUST NOT be deleted.Note - // that resources associated with the container, but not - // created by this container, MUST NOT be deleted. 
- - if err := c.Destroy(); err != nil { - return errors.Wrap(err, "failed to delete container.") - } - - // TODO - because we set rootfs.managed=0, Destroy() doesn't - // delete the /var/lib/lxc/$containerID/config file: - configDir := filepath.Join(LXC_PATH, containerID) - if err := os.RemoveAll(configDir); err != nil { - return errors.Wrapf(err, "failed to remove %s", configDir) - } - - return nil -} diff --git a/cmd/internal.go b/cmd/internal.go deleted file mode 100644 index e72b2497..00000000 --- a/cmd/internal.go +++ /dev/null @@ -1,117 +0,0 @@ -/* - * This file is a little bit strange. The problem is that we want to do - * daemonized containers with liblxc, but we can't spawn containers in threaded - * environments (i.e. golang), with go-lxc. So instead, we embed some C into - * our program that catches execution before golang starts. This way, we can do - * a tiny C program to actually spawn the container. - * - */ -package main - -// #cgo LDFLAGS: -llxc -/* -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -#include - -static int spawn_container(char *name, char *lxcpath, char *config) -{ - struct lxc_container *c; - - c = lxc_container_new(name, lxcpath); - if (!c) { - fprintf(stderr, "failed to create container %s\n", name); - return -1; - } - - c->clear_config(c); - if (!c->load_config(c, config)) { - fprintf(stderr, "failed to load container config at %s\n", config); - return -1; - } - - c->daemonize = false; - if (!c->start(c, 1, NULL)) { - fprintf(stderr, "failed to start container %s\n", name); - return -1; - } - - return c->error_num; -} - -// main function for the "internal" command. 
Right now, arguments look like: -// argv[0] internal -__attribute__((constructor)) void internal(void) -{ - int ret, status; - char buf[4096]; - ssize_t size; - char *cur, *name, *lxcpath, *config_path; - - ret = open("/proc/self/cmdline", O_RDONLY); - if (ret < 0) { - perror("error: open"); - exit(96); - } - - if ((size = read(ret, buf, sizeof(buf)-1)) < 0) { - close(ret); - perror("error: read"); - exit(96); - } - close(ret); - - // /proc/self/cmdline is null separated, but let's be real safe - buf[size] = 0; - cur = buf; - -#define ADVANCE_ARG \ - do { \ - while (*cur) { \ - cur++; \ - } \ - cur++; \ - } while (0) - - // skip argv[0] - ADVANCE_ARG; - - // is this really the internal command, if not, continue normal execution - if (strcmp(cur, "internal")) - return; - - ADVANCE_ARG; - name = cur; - ADVANCE_ARG; - lxcpath = cur; - ADVANCE_ARG; - config_path = cur; - - ret = isatty(STDIN_FILENO); - if (ret < 0) { - perror("isatty"); - exit(96); - } - - // If this is non interactive, get rid of our controlling terminal, - // since we don't want lxc's setting of ISIG to ignore user's ^Cs. - if (!ret) - setsid(); - - status = spawn_container(name, lxcpath, config_path); - - // Try and propagate the container's exit code. 
- if (WIFEXITED(status)) { - exit(WEXITSTATUS(status)); - } else { - kill(0, WTERMSIG(status)); - exit(EXIT_FAILURE); - } -} -*/ -import "C" diff --git a/cmd/kill.go b/cmd/kill.go deleted file mode 100644 index 3af4ddfe..00000000 --- a/cmd/kill.go +++ /dev/null @@ -1,105 +0,0 @@ -package main - -import ( - "fmt" - "os" - "syscall" - - "golang.org/x/sys/unix" - - //"github.com/apex/log" - "github.com/pkg/errors" - "github.com/urfave/cli" - - lxc "gopkg.in/lxc/go-lxc.v2" -) - -var killCmd = cli.Command{ - Name: "kill", - Usage: "sends a signal to a container", - Action: doKill, - ArgsUsage: `[containerID] - - is the ID of the container to send a signal to -`, - Flags: []cli.Flag{ - cli.StringFlag{ - Name: "signal", - Usage: "the signal to send, as a string", - Value: "TERM", - }, - }, -} -var signalMap = map[string]syscall.Signal{ - "ABRT": unix.SIGABRT, - "ALRM": unix.SIGALRM, - "BUS": unix.SIGBUS, - "CHLD": unix.SIGCHLD, - "CLD": unix.SIGCLD, - "CONT": unix.SIGCONT, - "FPE": unix.SIGFPE, - "HUP": unix.SIGHUP, - "ILL": unix.SIGILL, - "INT": unix.SIGINT, - "IO": unix.SIGIO, - "IOT": unix.SIGIOT, - "KILL": unix.SIGKILL, - "PIPE": unix.SIGPIPE, - "POLL": unix.SIGPOLL, - "PROF": unix.SIGPROF, - "PWR": unix.SIGPWR, - "QUIT": unix.SIGQUIT, - "SEGV": unix.SIGSEGV, - "STKFLT": unix.SIGSTKFLT, - "STOP": unix.SIGSTOP, - "SYS": unix.SIGSYS, - "TERM": unix.SIGTERM, - "TRAP": unix.SIGTRAP, - "TSTP": unix.SIGTSTP, - "TTIN": unix.SIGTTIN, - "TTOU": unix.SIGTTOU, - "URG": unix.SIGURG, - "USR1": unix.SIGUSR1, - "USR2": unix.SIGUSR2, - "VTALRM": unix.SIGVTALRM, - "WINCH": unix.SIGWINCH, - "XCPU": unix.SIGXCPU, - "XFSZ": unix.SIGXFSZ, -} - -func doKill(ctx *cli.Context) error { - containerID := ctx.Args().Get(0) - if len(containerID) == 0 { - fmt.Fprintf(os.Stderr, "missing container ID\n") - cli.ShowCommandHelpAndExit(ctx, "state", 1) - } - - exists, err := containerExists(containerID) - if err != nil { - return errors.Wrap(err, "failed to check if container exists") - } - if !exists 
{ - return fmt.Errorf("container '%s' not found", containerID) - } - - c, err := lxc.NewContainer(containerID, LXC_PATH) - if err != nil { - return errors.Wrap(err, "failed to load container") - } - defer c.Release() - - if err := configureLogging(ctx, c); err != nil { - return errors.Wrap(err, "failed to configure logging") - - } - - if c.Running() && checkHackyPreStart(c) == "started" { - pid := c.InitPid() - - if err := unix.Kill(pid, signalMap[ctx.String("signal")]); err != nil { - return errors.Wrap(err, "failed to send signal") - } - return nil - } - return fmt.Errorf("container '%s' is not running", containerID) -} diff --git a/cmd/lxcri-hook-builtin/main.go b/cmd/lxcri-hook-builtin/main.go new file mode 100644 index 00000000..ef8918f3 --- /dev/null +++ b/cmd/lxcri-hook-builtin/main.go @@ -0,0 +1,80 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +func main() { + rootfs, _, spec, err := specki.InitHook(os.Stdin) + if err != nil { + panic(err) + } + + for _, dev := range spec.Linux.Devices { + if err := createDevice(rootfs, dev); err != nil { + err := fmt.Errorf("failed to create device %s: %w", dev.Path, err) + fmt.Fprintln(os.Stderr, err.Error()) + } + } + + for _, p := range spec.Linux.MaskedPaths { + if err := maskPath(filepath.Join(rootfs, p)); err != nil { + err := fmt.Errorf("failed to mask path %s: %w", p, err) + fmt.Fprintln(os.Stderr, err.Error()) + } + } +} + +func getDeviceMode(dev specs.LinuxDevice) (uint32, error) { + var fileType uint32 + + switch dev.Type { + case "b": + fileType = unix.S_IFBLK + case "c": + fileType = unix.S_IFCHR + case "p": + fileType = unix.S_IFIFO + default: + return 0, fmt.Errorf("unsupported device type: %s", dev.Type) + } + + var perm uint32 = 0666 + if dev.FileMode != nil { + perm = uint32(*dev.FileMode) + } + return (fileType | perm), nil +} + +func createDevice(rootfs 
string, dev specs.LinuxDevice) error { + mode, err := getDeviceMode(dev) + if err != nil { + return err + } + + // ignored by unix.Mknod if dev.Type is not unix.S_IFBLK or unix.S_IFCHR + mkdev := int(unix.Mkdev(uint32(dev.Major), uint32(dev.Minor))) + + err = unix.Mknod(filepath.Join(rootfs, dev.Path), mode, mkdev) + if err != nil { + return fmt.Errorf("mknod failed: %s", err) + } + return nil +} + +func maskPath(p string) error { + err := unix.Mount("/dev/null", p, "", unix.MS_BIND, "") + if os.IsNotExist(err) { + return nil + } + if err == unix.ENOTDIR { + return unix.Mount("tmpfs", p, "tmpfs", unix.MS_RDONLY, "") + } + return err +} diff --git a/cmd/lxcri-hook/README.md b/cmd/lxcri-hook/README.md new file mode 100644 index 00000000..d1df1c79 --- /dev/null +++ b/cmd/lxcri-hook/README.md @@ -0,0 +1,128 @@ +# Hooks + +* see https://github.com/opencontainers/runtime-spec/blob/master/config.md + +## Notes + +The OCI hooks wrapper will not work in plain lxc containers because the +OCI state (state.json, hooks.json, config.json) is not available. + +It's perfectly reasonable to run hooks directly from lxcri cli + +OCI state must be bind mounted into the container. + +## CreateRuntime + +NOTE underspecified +conditions: mount namespace has been created, mount operations performed (all ?) + +* when: before pivot_root, after namespace creation +* path: runtime namespace +* exec: runtime namespace + +* maps to: lxc.hook.pre-start ? (mounts are not created) +* lxc.hook.pre-mount ? (container's fs namespace == mount namespace ?) + +## CreateContainer + +* when: before pivot_root, after mount namespace setup +* path: runtime namespace +* exec: container namespace + +* maps to: lxc.hook.mount + +## StartContainer + +* when: before lxcri-init execs, after mounts are complete +* path: container namespace +* exec: container namespace + +* maps to: lxc.hook.start + +Run from `lxcri-init` the same way the user process is executed? + +Bind mount hook launcher into container.
+Create folder with environ/cmdline files for each hook. + +## PostStart + +* when: after syncfifo is unblocked +* path: runtime namespace +* exec: runtime namespace + +* maps to: no LXC hook + +Usually this is done manually after calling `lxc-start` +Run directly after unblocking the syncfifo in Runtime.Start +Set LXC_ environment variables ? + +## PostStop + +* when: after container delete / before delete returns +* path: runtime namespace +* exec: runtime namespace + +* maps to: lxc.hook.destroy + +Run directly in Runtime.Delete + + +### Solution 1 + +Add a cli command `hooks` with the container name and the hook as argument. + +* Bad: hooks should not be accessible through the CLI because they + should only be executed within defined runtime states. + (simply hide the command from the help output ?) + +* Bad: lxcri with all libraries must be available in the container for + CreateContainer and StartContainer hooks. + +### Idea 2 + +* Update the container state in runtime commands and serialize it to the runtime directory. + +Extend / Update the state from the LXC hook environment variables. +Create a single C binary that executes the hooks from the lxc hook. + +Serialize hooks into a format that can be consumed by hooks +and started from 'liblxc' using a simple static C binary, +similar to `lxcri-init`. + +Use the same mechanism `lxcri-init` uses to exec the hook +processes. + +* Bind mount the hook directories, for hooks running in the +container namespace into the container. +e.g /.lxcri/hooks + +lxc.hook.mount = lxcri-hook create-runtime + + +e.g create + +{runtime_dir}/state.json + +{runtime_dir}/hooks/create_runtime/1/cmdline +{runtime_dir}/hooks/create_runtime/1/environ + +{runtime_dir}/hooks/create_runtime/2/cmdline +{runtime_dir}/hooks/create_runtime/2/environ + +... + +{runtime_dir}/hooks/create-container/1/cmdline +{runtime_dir}/hooks/create-container/2/environ + + + +Pass state.json to executed process.
+ + +c tool can iterate over contents in the hook directory +and load and execute process and cmline +for each subfolder. + +* can be implemented as go binary and as C binary .... + +* timeout: set as additional environment variable e.g OCI_HOOK_TIMEOUT diff --git a/cmd/lxcri-hook/hooks.go b/cmd/lxcri-hook/hooks.go new file mode 100644 index 00000000..2763797d --- /dev/null +++ b/cmd/lxcri-hook/hooks.go @@ -0,0 +1,90 @@ +package main + +import ( + "errors" + "os" + "strings" +) + +// HookType is the liblxc hook type. +type HookType string + +// List of liblxc hook types. +const ( + HookPreStart HookType = "pre-start" + HookPreMount HookType = "pre-mount" + HookMount HookType = "mount" + HookAutodev HookType = "autodev" + HookStartHost HookType = "start-host" + HookStart HookType = "start" + HookStop HookType = "stop" + HookPostStop HookType = "post-stop" + HookClone HookType = "clone" + HookDestroy HookType = "destroy" + //HookPostStart = "post-start" // not defined by liblxc +) + +// Env is the parsed liblxc hook environment. +type Env struct { + // CgroupAware is true if the container is cgroup namespace aware. + CgroupAware bool + // ConfigFile is the path to the container configuration file. + ConfigFile string + // Type is the hook type. + Type HookType + // Section is the hooks section type (e.g. 'lxc', 'net'). + Section string + // Version is the version of the hooks + Version string + // LogLevel is the container's log level. + LogLevel string + // ContainerName is the container's name. + ContainerName string + // SharedNamespaces maps namespace names from /proc/{pid}/ns + // to the file descriptor path referring to the container's namespace. + SharedNamespaces map[string]string + // RootfsMount is the path to the mounted root filesystem. + RootfsMount string + // RootfsPath is the lxc.rootfs.path entry for the container. + RootfsPath string + // SrcContainerName is the original container's name, + // in the case of the clone hook. 
+ SrcContainerName string +} + +var namespaces = []string{"cgroup", "ipc", "mnt", "net", "pid", "time", "user", "uts"} + +// ErrEnv is the error returned by LoadEnv +// if the LXC_HOOK_TYPE environment variable is not set. +var ErrEnv = errors.New("LXC_HOOK_TYPE environment variable is not set") + +// LoadEnv parses all liblxc hook environment variables, +// and returns the parsed values in an Env struct. +// If `LXC_HOOK_TYPE` is not set ErrEnv will be returned. +// NOTE The environment variables in liblxc hooks are all prefixed with LXC_. +func LoadEnv() (*Env, error) { + hookType, exist := os.LookupEnv("LXC_HOOK_TYPE") + if !exist { + return nil, ErrEnv + } + + env := &Env{ + ConfigFile: os.Getenv("LXC_CONFIG_FILE"), + Type: HookType(hookType), + Section: os.Getenv("LXC_HOOK_SECTION"), + Version: os.Getenv("LXC_HOOK_VERSION"), + LogLevel: os.Getenv("LXC_LOG_LEVEL"), + ContainerName: os.Getenv("LXC_NAME"), + RootfsMount: os.Getenv("LXC_ROOTFS_MOUNT"), + RootfsPath: os.Getenv("LXC_ROOTFS_PATH"), + SrcContainerName: os.Getenv("LXC_SRC_NAME"), + } + + env.SharedNamespaces = make(map[string]string, len(namespaces)) + for _, ns := range namespaces { + if val, ok := os.LookupEnv("LXC_" + strings.ToUpper(ns) + "_NS"); ok { + env.SharedNamespaces[ns] = val + } + } + return env, nil +} diff --git a/cmd/lxcri-hook/main.go b/cmd/lxcri-hook/main.go new file mode 100644 index 00000000..20045424 --- /dev/null +++ b/cmd/lxcri-hook/main.go @@ -0,0 +1,101 @@ +package main + +import ( + "context" + "flag" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func init() { + // from `man lxc.container.conf` + // Standard output from the hooks is logged at debug level + // Standard error is not logged, but can be captured by the hook + // redirecting its standard error to standard output. 
+ os.Stderr = os.Stdout +} + +func main() { + var timeout int + // Individual hooks should set a timeout lower than the overall timeout. + flag.IntVar(&timeout, "timeout", 30, "maximum run time in seconds allowed for all hooks") + flag.Parse() + + ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second) + defer cancel() + + env, err := LoadEnv() + if err != nil { + fmt.Println(err.Error()) + os.Exit(2) + } + + err = run(ctx, env) + if err != nil { + fmt.Println(err.Error()) + os.Exit(3) + } +} + +func run(ctx context.Context, env *Env) error { + runtimeDir := filepath.Dir(env.ConfigFile) + + var hooks specs.Hooks + err := specki.DecodeJSONFile(filepath.Join(runtimeDir, "hooks.json"), &hooks) + if err != nil { + return err + } + + hooksToRun, status, err := ociHooksAndState(env.Type, &hooks) + if err != nil { + return err + } + + if len(hooksToRun) == 0 { + return fmt.Errorf("no OCI hooks defined for lxc hook %q", env.Type) + } + + // need to deserialize it to set the current specs.ContainerState + var state specs.State + err = specki.DecodeJSONFile(filepath.Join(runtimeDir, "state.json"), &state) + if err != nil { + return err + } + state.Status = status + + fmt.Printf("running OCI hooks for lxc hook %q", env.Type) + return specki.RunHooks(ctx, &state, hooksToRun, false) +} + +// https://github.com/opencontainers/runtime-spec/blob/master/specs-go/state.go +// The only value that does change is the specs.ContainerState in specs.State.Status. +// The specs.ContainerState is implied by the runtime hook. +// status, and the status is already defined by the hook itself ... 
+func ociHooksAndState(t HookType, hooks *specs.Hooks) ([]specs.Hook, specs.ContainerState, error) { + switch t { + case HookPreMount: + // quote from https://github.com/opencontainers/runtime-spec/blob/master/config.md#posix-platform-hooks + // > For runtimes that implement the deprecated prestart hooks as createRuntime hooks, + // > createRuntime hooks MUST be called after the prestart hooks. + if len(hooks.CreateRuntime) > 0 { + return append(hooks.Prestart, hooks.CreateRuntime...), specs.StateCreating, nil + } + return hooks.Prestart, specs.StateCreating, nil + case HookMount: + return hooks.CreateContainer, specs.StateCreating, nil + //case HookStart: + // return hooks.StartContainer, specs.StateCreated, nil + // NOTE the following hooks are executed directly from lxcri + //case HookPostStart: + // return hooks.Poststart, specs.StateRunning, nil + //case HookDestroy: + // return hooks.Poststop, specs.StateStopped, nil + default: + return nil, specs.StateStopped, fmt.Errorf("liblxc hook %q is not mapped to OCI hooks", t) + } +} diff --git a/cmd/lxcri-init/main.go b/cmd/lxcri-init/main.go new file mode 100644 index 00000000..0c29b453 --- /dev/null +++ b/cmd/lxcri-init/main.go @@ -0,0 +1,147 @@ +package main + +import ( + "context" + "fmt" + "os" + "os/exec" + "os/user" + "path/filepath" + "time" + + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +func main() { + // TODO use environment variable for runtime dir + runtimeDir, err := os.Getwd() + if err != nil { + fmt.Fprintf(os.Stderr, "failed to get runtime dir: %s\n", err) + os.Exit(2) + } + + specPath := filepath.Join(runtimeDir, "config.json") + spec, err := specki.ReadSpecJSON(specPath) + if err != nil { + fmt.Fprintf(os.Stderr, "%s\n", err.Error()) + os.Exit(3) + } + + err = doInit(runtimeDir, spec) + if err != nil { + if err := writeTerminationLog(spec, "init failed: %s\n", err); err != nil { + fmt.Fprintf(os.Stderr, "%s", err) 
+ } + fmt.Fprintf(os.Stderr, "init failed: %s\n", err) + os.Exit(4) + } +} + +func writeTerminationLog(spec *specs.Spec, format string, a ...interface{}) error { + var terminationLog string + if spec.Annotations != nil { + terminationLog = spec.Annotations["io.kubernetes.container.terminationMessagePath"] + } + if terminationLog == "" { + return nil + } + + f, err := os.OpenFile(terminationLog, os.O_WRONLY|os.O_APPEND, 0640) + if err != nil { + return err + } + defer f.Close() + + _, err = fmt.Fprintf(f, format, a...) + if err != nil { + return fmt.Errorf("failed to write to termination log %q: %w", terminationLog, err) + } + return nil +} + +func doInit(runtimeDir string, spec *specs.Spec) error { + statePath := filepath.Join(runtimeDir, "state.json") + state, err := specki.LoadSpecStateJSON(statePath) + if err != nil { + return fmt.Errorf("failed to read spec %q: %s", statePath, err) + } + + cmdPath := spec.Process.Args[0] + val, exist := specki.Getenv(spec.Process.Env, "PATH") + if exist { + err := os.Setenv("PATH", val) + if err != nil { + return fmt.Errorf("failed to set PATH environment variable: %s", err) + } + cmdPath, err = exec.LookPath(spec.Process.Args[0]) + if err != nil { + return fmt.Errorf("lookup path for %s failed: %w", spec.Process.Args[0], err) + } + } + + _, exist = specki.Getenv(spec.Process.Env, "HOME") + if !exist { + addEnvHome(spec) + } + + err = unix.Chdir(spec.Process.Cwd) + if err != nil { + return fmt.Errorf("failed to change cwd to %s: %w", spec.Process.Cwd, err) + } + + err = readSyncfifo(filepath.Join(runtimeDir, "syncfifo")) + if err != nil { + return err + } + + // TODO use environment variable to control timeout + ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) + defer cancel() + err = specki.RunHooks(ctx, state, spec.Hooks.StartContainer, false) + if err != nil { + return err + } + + unix.Exec(cmdPath, spec.Process.Args, spec.Process.Env) + if err != nil { + return fmt.Errorf("exec failed: %w", err) + } 
+ return nil +} + +func readSyncfifo(filename string) error { + f, err := os.OpenFile(filename, os.O_RDONLY, 0) + if err != nil { + return fmt.Errorf("failed to open %s: %w", filename, err) + } + return f.Close() +} + +/* +func closeExtraFds() { + os.Open("/proc/self/fd") +} +*/ + +func addEnvHome(spec *specs.Spec) { + // lookup users home directory in passwd. + userName := spec.Process.User.Username + if userName != "" { + u, err := user.Lookup(userName) + if err == nil && u.HomeDir != "" { + spec.Process.Env = append(spec.Process.Env, "HOME="+u.HomeDir) + return + } + } + // If user is root without entry in /etc/passwd then try /root + if spec.Process.User.UID == 0 { + stat, err := os.Stat("/root") + if err == nil && stat.IsDir() { + spec.Process.Env = append(spec.Process.Env, "HOME=/root") + return + } + } + spec.Process.Env = append(spec.Process.Env, "HOME="+spec.Process.Cwd) +} diff --git a/cmd/lxcri-start/lxcri-start.c b/cmd/lxcri-start/lxcri-start.c new file mode 100644 index 00000000..41e84cef --- /dev/null +++ b/cmd/lxcri-start/lxcri-start.c @@ -0,0 +1,81 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include + +/* +/ Set to 0 to disable use of lxc-init. +/ The container process should have PID 1. +*/ +#define ENABLE_LXCINIT 0 + +#define ERROR(format, ...) \ + { \ + fprintf(stderr, "[lxcri-start] " format, ##__VA_ARGS__); \ + ret = EXIT_FAILURE; \ + goto out; \ + } + +/* NOTE lxc_execute.c was taken as guidline and some lines where copied. */ +int main(int argc, char **argv) +{ + struct lxc_container *c = NULL; + int ret = EXIT_SUCCESS; + const char *name; + const char *lxcpath; + const char *rcfile; + + /* Ensure stdout and stderr are line bufferd. 
*/ + setvbuf(stdout, NULL, _IOLBF, -1); + setvbuf(stderr, NULL, _IOLBF, -1); + errno = 0; + + if (argc != 4) + ERROR("invalid argument count, usage: " + "$0 \n"); + + /* + / If this is non interactive, get rid of our controlling terminal, + / since we don't want lxc's setting of ISIG to ignore user's ^Cs. + / Ignore any error - because controlling terminal could be a PTY. + */ + setsid(); + errno = 0; + + name = argv[1]; + lxcpath = argv[2]; + rcfile = argv[3]; + + c = lxc_container_new(name, lxcpath); + if (c == NULL) + ERROR("failed to create new container"); + + c->clear_config(c); + + if (!c->load_config(c, rcfile)) + ERROR("failed to load container config %s\n", rcfile); + + /* Do not daemonize - this would null the inherited stdio. */ + c->daemonize = false; + + if (!c->start(c, ENABLE_LXCINIT, NULL)) + ERROR("monitor process pid=%d failed (container error_num:%d)\n", getpid(), c->error_num); + + /* Try to die with the same signal the task did. */ + /* FIXME error_num is zero if init was killed with SIGHUP */ + if (WIFSIGNALED(c->error_num)) + kill(0, WTERMSIG(c->error_num)); + + if (WIFEXITED(c->error_num)) + ret = WEXITSTATUS(c->error_num); +out: + if (c != NULL) + lxc_container_put(c); + exit(ret); +} diff --git a/cmd/lxcri/cli.go b/cmd/lxcri/cli.go new file mode 100644 index 00000000..41deeadd --- /dev/null +++ b/cmd/lxcri/cli.go @@ -0,0 +1,641 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + "text/template" + "time" + + "github.com/drachenfels-de/lxcri" + "github.com/drachenfels-de/lxcri/pkg/log" + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli/v2" +) + +var ( + // Environment variables are populated by default from this environment file. + // Existing environment variables are preserved. 
+ envFile = "/etc/default/lxcri" + defaultLogFile = "/var/log/lxcri/lxcri.log" + version = "undefined" + libexecDir = "/usr/libexec/lxcri" +) + +type app struct { + lxcri.Runtime + cfg lxcri.ContainerConfig + + logConfig struct { + File *os.File + FilePath string + Level string + Timestamp string + } + + command string +} + +var clxc = app{} + +func (app *app) configureLogger() error { + // TODO use console logger if filepath is /dev/stdout or /dev/stderr ? + l, err := log.OpenFile(app.logConfig.FilePath, 0600) + if err != nil { + return fmt.Errorf("failed to open log file: %w", err) + } + app.logConfig.File = l + + level, err := log.ParseLevel(app.logConfig.Level) + if err != nil { + return fmt.Errorf("failed to parse log level: %w", err) + } + logCtx := log.NewLogger(app.logConfig.File, level) + app.Runtime.Log = logCtx.Str("cmd", app.command).Str("cid", app.cfg.ContainerID).Logger() + app.cfg.Log = app.Runtime.Log + + return nil +} + +func (app *app) release() error { + if app.logConfig.File != nil { + return app.logConfig.File.Close() + } + return nil +} + +func main() { + app := cli.NewApp() + app.Name = "lxcri" + app.Usage = "lxcri is a OCI compliant runtime wrapper for lxc" + app.Version = version + + // Disable the default ExitErrHandler. + // It will call os.Exit if a command returns an error that implements + // the cli.ExitCoder interface. E.g an unwrapped error from os.Exec. + app.ExitErrHandler = func(context *cli.Context, err error) {} + app.Commands = []*cli.Command{ + &stateCmd, + &createCmd, + &startCmd, + &killCmd, + &deleteCmd, + &execCmd, + &inspectCmd, + // TODO extend urfave/cli to render a default environment file. 
+ } + + app.Flags = []cli.Flag{ + &cli.StringFlag{ + Name: "log-level", + Usage: "set the runtime log level (trace|debug|info|warn|error)", + EnvVars: []string{"LXCRI_LOG_LEVEL"}, + Value: "info", + Destination: &clxc.logConfig.Level, + }, + &cli.StringFlag{ + Name: "log-file", + Usage: "path to the log file for runtime and container output", + EnvVars: []string{"LXCRI_LOG_FILE"}, + Value: defaultLogFile, + Destination: &clxc.logConfig.FilePath, + }, + &cli.StringFlag{ + Name: "log-timestamp", + Usage: "timestamp format for the runtime log (see golang time package), default matches liblxc timestamp", + EnvVars: []string{"LXCRI_LOG_TIMESTAMP"}, // e.g '0102 15:04:05.000' + Destination: &clxc.logConfig.Timestamp, + }, + &cli.StringFlag{ + Name: "container-log-level", + Usage: "set the container process (liblxc) log level (trace|debug|info|notice|warn|error|crit|alert|fatal)", + EnvVars: []string{"LXCRI_CONTAINER_LOG_LEVEL"}, + Value: "warn", + Destination: &clxc.cfg.LogLevel, + }, + &cli.StringFlag{ + Name: "container-log-file", + Usage: "path to the log file for runtime and container output", + EnvVars: []string{"LXCRI_CONTAINER_LOG_FILE"}, + Value: defaultLogFile, + Destination: &clxc.cfg.LogFile, + }, + &cli.StringFlag{ + Name: "root", + Usage: "container runtime root where (logs, init and hook scripts). 
tmpfs is recommended.", + // exec permissions are not required because init is bind mounted into the root + Value: "/run/lxcri", + Destination: &clxc.Root, + }, + &cli.BoolFlag{ + Name: "systemd-cgroup", + Usage: "enable support for systemd encoded cgroup path", + Destination: &clxc.SystemdCgroup, + }, + &cli.StringFlag{ + Name: "monitor-cgroup", + Usage: "cgroup slice for liblxc monitor process and pivot path", + Destination: &clxc.MonitorCgroup, + EnvVars: []string{"LXCRI_MONITOR_CGROUP"}, + Value: "lxcri-monitor.slice", + }, + &cli.StringFlag{ + Name: "libexec", + Usage: "directory to load runtime executables from", + EnvVars: []string{"LXCRI_LIBEXEC"}, + Value: libexecDir, + Destination: &clxc.LibexecDir, + }, + &cli.BoolFlag{ + Name: "apparmor", + Usage: "set apparmor profile defined in container spec", + Destination: &clxc.Features.Apparmor, + EnvVars: []string{"LXCRI_APPARMOR"}, + Value: true, + }, + &cli.BoolFlag{ + Name: "capabilities", + Usage: "keep capabilities defined in container spec", + Destination: &clxc.Features.Capabilities, + EnvVars: []string{"LXCRI_CAPABILITIES"}, + Value: true, + }, + &cli.BoolFlag{ + Name: "cgroup-devices", + Usage: "allow only devices permitted by container spec", + Destination: &clxc.Features.CgroupDevices, + EnvVars: []string{"LXCRI_CGROUP_DEVICES"}, + Value: true, + }, + &cli.BoolFlag{ + Name: "seccomp", + Usage: "Generate and apply seccomp profile for lxc from container spec", + Destination: &clxc.Features.Seccomp, + EnvVars: []string{"LXCRI_SECCOMP"}, + Value: true, + }, + } + + startTime := time.Now() + + // Environment variables must be injected from file before app.Run() is called. + // Otherwise the values are not set to the crioLXC instance. + // FIXME when calling '--help' defaults are overwritten with environment variables. + // So you will never see the real default value if either an environment file is present + // or an environment variable is set. 
+ env, err := loadEnvFile(envFile) + if err != nil { + println(err.Error()) + os.Exit(1) + } + for key, val := range env { + if err := setEnv(key, val, false); err != nil { + err = fmt.Errorf("failed to set environment variable \"%s=%s\": %w", key, val, err) + println(err.Error()) + os.Exit(1) + } + } + + app.CommandNotFound = func(ctx *cli.Context, cmd string) { + fmt.Fprintf(os.Stderr, "undefined subcommand %q cmdline%s\n", cmd, os.Args) + } + // Disable the default error messages for cmdline errors. + // By default the app/cmd help is printed to stdout, which produces garbage in cri-o log output. + // Instead the cmdline is printed to stderr to identify cmdline interface errors. + errUsage := func(context *cli.Context, err error, isSubcommand bool) error { + fmt.Fprintf(os.Stderr, "usage error %s: %s\n", err, os.Args) + return err + } + app.OnUsageError = errUsage + + app.Before = func(ctx *cli.Context) error { + clxc.command = ctx.Args().Get(0) + return nil + } + + setupCmd := func(ctx *cli.Context) error { + containerID := ctx.Args().Get(0) + if len(containerID) == 0 { + return fmt.Errorf("missing container ID") + } + clxc.cfg.ContainerID = containerID + + if err := clxc.configureLogger(); err != nil { + return fmt.Errorf("failed to configure logger: %w", err) + } + return nil + } + + for _, cmd := range app.Commands { + cmd.Before = setupCmd + cmd.OnUsageError = errUsage + } + + err = app.Run(os.Args) + + cmdDuration := time.Since(startTime) + + if err != nil { + clxc.Log.Error().Err(err).Dur("duration", cmdDuration).Msg("cmd failed") + clxc.release() + // write diagnostics message to stderr for crio/kubelet + println(err.Error()) + + // exit with exit status of executed command + var errExec execError + if errors.As(err, &errExec) { + os.Exit(errExec.exitStatus()) + } + os.Exit(1) + } + + clxc.Log.Debug().Dur("duration", cmdDuration).Msg("cmd completed") + if clxc.release(); err != nil { + println(err.Error()) + os.Exit(1) + } +} + +var createCmd = 
cli.Command{ + Name: "create", + Usage: "create a container from a bundle directory", + ArgsUsage: "", + Action: doCreate, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "bundle", + Usage: "set bundle directory", + Value: ".", + Destination: &clxc.cfg.BundlePath, + }, + &cli.StringFlag{ + Name: "console-socket", + Usage: "send container pty master fd to this socket path", + Destination: &clxc.cfg.ConsoleSocket, + }, + &cli.StringFlag{ + Name: "pid-file", + Usage: "path to write container PID", + }, + &cli.UintFlag{ + Name: "timeout", + Usage: "maximum duration in seconds for create to complete", + EnvVars: []string{"LXCRI_CREATE_TIMEOUT"}, + Value: 60, + }, + }, +} + +func doCreate(ctxcli *cli.Context) error { + if err := clxc.Init(); err != nil { + return err + } + specPath := filepath.Join(clxc.cfg.BundlePath, lxcri.BundleConfigFile) + spec, err := specki.ReadSpecJSON(specPath) + if err != nil { + return fmt.Errorf("failed to load container spec from bundle: %w", err) + } + clxc.cfg.Spec = spec + pidFile := ctxcli.String("pid-file") + + timeout := time.Duration(ctxcli.Uint("timeout")) * time.Second + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + if err := doCreateInternal(ctx, pidFile); err != nil { + // Create a new context because create may fail with a timeout. 
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := clxc.Delete(ctx, clxc.cfg.ContainerID, true); err != nil { + clxc.Log.Error().Err(err).Msg("failed to destroy container") + } + return err + } + return nil +} + +func doCreateInternal(ctx context.Context, pidFile string) error { + c, err := clxc.Create(ctx, &clxc.cfg) + if err != nil { + return err + } + if pidFile != "" { + err := createPidFile(pidFile, c.Pid) + if err != nil { + return err + } + } + return c.Release() +} + +var startCmd = cli.Command{ + Name: "start", + Usage: "starts a container", + Action: doStart, + ArgsUsage: `[containerID] + +starts +`, + Flags: []cli.Flag{ + &cli.UintFlag{ + Name: "timeout", + Usage: "maximum duration in seconds for start to complete", + EnvVars: []string{"LXCRI_START_TIMEOUT"}, + Value: 30, + }, + }, +} + +func doStart(ctxcli *cli.Context) error { + + timeout := time.Duration(ctxcli.Uint("timeout")) * time.Second + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + if err := doStartInternal(ctx); err != nil { + // a new context because start may fail with a timeout. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := clxc.Delete(ctx, clxc.cfg.ContainerID, true); err != nil { + clxc.Log.Error().Err(err).Msg("failed to destroy container") + } + return err + } + return nil +} + +func doStartInternal(ctx context.Context) error { + c, err := clxc.Load(clxc.cfg.ContainerID) + if err != nil { + return fmt.Errorf("failed to load container: %w", err) + } + + return clxc.Start(ctx, c) +} + +var stateCmd = cli.Command{ + Name: "state", + Usage: "returns state of a container", + Action: doState, + ArgsUsage: `[containerID] + + is the ID of the container you want to know about. 
+`, + Flags: []cli.Flag{}, +} + +func doState(unused *cli.Context) error { + c, err := clxc.Load(clxc.cfg.ContainerID) + if err != nil { + return fmt.Errorf("failed to load container: %w", err) + } + state, err := c.State() + if err != nil { + return err + } + j, err := json.Marshal(state.SpecState) + if err != nil { + return fmt.Errorf("failed to marshal json: %w", err) + } + clxc.Log.Trace().RawJSON("state", j).Msg("container state") + _, err = fmt.Fprint(os.Stdout, string(j)) + return err +} + +var killCmd = cli.Command{ + Name: "kill", + Usage: "sends a signal to a container", + Action: doKill, + ArgsUsage: `[containerID] [signal] + + is the ID of the container to send a signal to +[signal] signal name or numerical value (e.g [9|kill|KILL|sigkill|SIGKILL]) +`, + Flags: []cli.Flag{ + &cli.UintFlag{ + Name: "timeout", + Usage: "timeout for killing all processes in container cgroup", + EnvVars: []string{"LXCRI_KILL_TIMEOUT"}, + Value: 10, + }, + }, +} + +func doKill(ctxcli *cli.Context) error { + sig := ctxcli.Args().Get(1) + signum := parseSignal(sig) + if signum == 0 { + return fmt.Errorf("invalid signal param %q", sig) + } + + c, err := clxc.Load(clxc.cfg.ContainerID) + if err != nil { + return fmt.Errorf("failed to load container: %w", err) + } + + timeout := time.Duration(ctxcli.Uint("timeout")) * time.Second + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + return clxc.Kill(ctx, c, signum) +} + +var deleteCmd = cli.Command{ + Name: "delete", + Usage: "deletes a container", + Action: doDelete, + ArgsUsage: `[containerID] + + is the ID of the container to delete +`, + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "force", + Usage: "force deletion", + }, + &cli.UintFlag{ + Name: "timeout", + Usage: "maximum duration in seconds for delete to complete", + EnvVars: []string{"LXCRI_DELETE_TIMEOUT"}, + Value: 10, + }, + }, +} + +func doDelete(ctxcli *cli.Context) error { + timeout := time.Duration(ctxcli.Uint("timeout")) * 
time.Second + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + err := clxc.Delete(ctx, clxc.cfg.ContainerID, ctxcli.Bool("force")) + // Deleting a non-existing container is a noop, + // otherwise cri-o / kubelet log warnings about that. + if errors.Is(err, lxcri.ErrNotExist) { + return nil + } + return err +} + +var execCmd = cli.Command{ + Name: "exec", + Usage: "execute a new process in a running container", + ArgsUsage: " [COMMAND] [args...]", + Action: doExec, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "process", + Aliases: []string{"p"}, + Usage: "path to process json - cmd and args are ignored if set", + Value: "", + }, + &cli.StringFlag{ + Name: "pid-file", + Usage: "file to write the process id to", + Value: "", + }, + &cli.BoolFlag{ + Name: "detach", + Aliases: []string{"d"}, + Usage: "detach from the executed process", + }, + }, +} + +type execError int + +func (e execError) exitStatus() int { + return int(e) +} + +func (e execError) Error() string { + // liblxc remaps execvp exit codes to shell exit codes. + // FIXME This is undocumented behaviour lxc/src/lxc/attach.c:lxc_attach_run_command + // https://github.com/lxc/go-lxc/blob/d1943fb48dc73ef5cbc0ef43ed585420f7b2eb3a/container.go#L1370 + // RunCommandStatus returns with exitCode 126 or 127 but without error, so it is not possible to determine + // whether this is the exit code from the command itself (e.g a shell itself) or from liblxc exec. 
+ switch int(e) { + case 126: + return "can not execute file: file header not recognized" + case 127: + return "executable file not found in $PATH" + default: + return fmt.Sprintf("cmd execution failed with exit status %d", e.exitStatus()) + } +} + +func doExec(ctxcli *cli.Context) error { + var args []string + if ctxcli.Args().Len() > 1 { + args = ctxcli.Args().Slice()[1:] + } + + pidFile := ctxcli.String("pid-file") + detach := ctxcli.Bool("detach") + + if detach && pidFile == "" { + clxc.Log.Warn().Msg("detaching process but pid-file value is unset") + } + + procSpec, err := specki.LoadSpecProcess(ctxcli.String("process"), args) + if err != nil { + return err + } + + c, err := clxc.Load(clxc.cfg.ContainerID) + if err != nil { + return err + } + + if detach { + pid, err := c.ExecDetached(procSpec) + if err != nil { + return err + } + if pidFile != "" { + return createPidFile(pidFile, pid) + } + } else { + status, err := c.Exec(procSpec) + if err != nil { + return err + } + if status != 0 { + return execError(status) + } + } + return nil +} + +var inspectCmd = cli.Command{ + Name: "inspect", + Usage: "returns inspect of a container", + Action: doInspect, + ArgsUsage: `containerID [containerID...] + + [containerID...] 
list of IDs for container to inspect +`, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "template", + Usage: "Use this go template to format output.", + }, + }, +} + +func doInspect(ctxcli *cli.Context) (err error) { + var t *template.Template + tmpl := ctxcli.String("template") + if tmpl != "" { + t, err = template.New("inspect").Parse(tmpl) + if err != nil { + return err + } + } + + for _, id := range ctxcli.Args().Slice() { + if err := inspectContainer(id, t); err != nil { + return err + } + } + return nil +} + +func inspectContainer(id string, t *template.Template) error { + c, err := clxc.Load(id) + if err != nil { + return fmt.Errorf("failed to load container: %w", err) + } + state, err := c.State() + if err != nil { + return fmt.Errorf("failed to get container state: %w", err) + } + + info := struct { + Spec *specs.Spec + Container *lxcri.Container + State *lxcri.State + }{ + Spec: c.Spec, + Container: c, + State: state, + } + + if t != nil { + return t.Execute(os.Stdout, info) + } + + // avoid duplicate output + c.Spec = nil + state.SpecState.Annotations = nil + + j, err := json.MarshalIndent(info, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal json: %w", err) + } + _, err = fmt.Fprint(os.Stdout, string(j)) + return err +} diff --git a/cmd/lxcri/utils.go b/cmd/lxcri/utils.go new file mode 100644 index 00000000..9e3c622c --- /dev/null +++ b/cmd/lxcri/utils.go @@ -0,0 +1,95 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" +) + +func setEnv(key, val string, overwrite bool) error { + _, exist := os.LookupEnv(key) + if !exist || overwrite { + return os.Setenv(key, val) + } + return nil +} + +func loadEnvFile(envFile string) (map[string]string, error) { + // don't fail if environment file does not exist + _, err := os.Stat(envFile) + if os.IsNotExist(err) { + return nil, nil + } + if err != nil { + return nil, err + } + + // #nosec + data, err := os.ReadFile(envFile) + if err 
!= nil { + return nil, err + } + lines := strings.Split(string(data), "\n") + env := make(map[string]string, len(lines)) + for n, line := range lines { + trimmed := strings.TrimSpace(line) + // skip over comments and blank lines + if len(trimmed) == 0 || trimmed[0] == '#' { + continue + } + vals := strings.SplitN(trimmed, "=", 2) + if len(vals) != 2 { + return nil, fmt.Errorf("invalid environment variable at line %s:%d", envFile, n+1) + } + key := strings.TrimSpace(vals[0]) + val := strings.Trim(strings.TrimSpace(vals[1]), `"'`) + env[key] = val + } + return env, nil +} + +func parseSignal(sig string) unix.Signal { + if sig == "" { + return unix.SIGTERM + } + // handle numerical signal value + if num, err := strconv.Atoi(sig); err == nil { + return unix.Signal(num) + } + + // gracefully handle all string variants e.g 'sigkill|SIGKILL|kill|KILL' + s := strings.ToUpper(sig) + if !strings.HasPrefix(s, "SIG") { + s = "SIG" + s + } + return unix.SignalNum(s) +} + +// createPidFile atomically creates a pid file for the given pid at the given path +func createPidFile(path string, pid int) error { + tmpDir := filepath.Dir(path) + tmpName := filepath.Join(tmpDir, fmt.Sprintf(".%s", filepath.Base(path))) + + // #nosec + f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0600) + if err != nil { + return fmt.Errorf("failed to create temporary PID file %q: %w", tmpName, err) + } + _, err = fmt.Fprintf(f, "%d", pid) + if err != nil { + return fmt.Errorf("failed to write to temporary PID file %q: %w", tmpName, err) + } + err = f.Close() + if err != nil { + return fmt.Errorf("failed to close temporary PID file %q: %w", tmpName, err) + } + err = os.Rename(tmpName, path) + if err != nil { + return fmt.Errorf("failed to rename temporary PID file %q to %q: %w", tmpName, path, err) + } + return nil +} diff --git a/cmd/lxcri/utils_test.go b/cmd/lxcri/utils_test.go new file mode 100644 index 00000000..447cb9e9 --- /dev/null +++ b/cmd/lxcri/utils_test.go @@ -0,0 
+1,32 @@ +package main + +import ( + "testing" + + "golang.org/x/sys/unix" + + "github.com/stretchr/testify/require" +) + +func TestParseSignal(t *testing.T) { + sig := parseSignal("9") + require.Equal(t, unix.SIGKILL, sig) + + sig = parseSignal("kill") + require.Equal(t, unix.SIGKILL, sig) + + sig = parseSignal("sigkill") + require.Equal(t, unix.SIGKILL, sig) + + sig = parseSignal("KILL") + require.Equal(t, unix.SIGKILL, sig) + + sig = parseSignal("SIGKILL") + require.Equal(t, unix.SIGKILL, sig) + + sig = parseSignal("SIGNOTEXIST") + require.Equal(t, unix.Signal(0), sig) + + sig = parseSignal("66") + require.Equal(t, unix.Signal(66), sig) +} diff --git a/cmd/main.go b/cmd/main.go deleted file mode 100644 index 36a02dcb..00000000 --- a/cmd/main.go +++ /dev/null @@ -1,67 +0,0 @@ -package main - -import ( - "fmt" - "os" - - "github.com/apex/log" - "github.com/urfave/cli" -) - -var ( - version = "" - debug = false -) - -func main() { - app := cli.NewApp() - app.Name = "crio-lxc" - app.Usage = "crio-lxc is a CRI compliant runtime wrapper for lxc" - app.Version = version - app.Commands = []cli.Command{ - stateCmd, - createCmd, - startCmd, - killCmd, - deleteCmd, - } - - app.Flags = []cli.Flag{ - cli.BoolFlag{ - Name: "debug", - Usage: "enable debug mode", - }, - cli.StringFlag{ - Name: "log-level", - Usage: "set log level for LXC", - }, - cli.StringFlag{ - Name: "log-file", - Usage: "log file for LXC", - }, - cli.StringFlag{ - Name: "lxc-path, root", - Usage: "set the lxc path to use", - Value: "/var/lib/lxc", - }, - } - - app.Before = func(ctx *cli.Context) error { - LXC_PATH = ctx.String("lxc-path") - - debug = ctx.Bool("debug") - return nil - } - - log.SetLevel(log.InfoLevel) - - if err := app.Run(os.Args); err != nil { - format := "error: %v\n" - if debug { - format = "error: %+v\n" - } - - fmt.Fprintf(os.Stderr, format, err) - os.Exit(1) - } -} diff --git a/cmd/start.go b/cmd/start.go deleted file mode 100644 index 7e248a24..00000000 --- a/cmd/start.go +++ 
/dev/null @@ -1,88 +0,0 @@ -package main - -import ( - "fmt" - "os" - "path/filepath" - - "github.com/apex/log" - // "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" - "github.com/urfave/cli" - - lxc "gopkg.in/lxc/go-lxc.v2" -) - -var startCmd = cli.Command{ - Name: "start", - Usage: "starts a container", - Action: doStart, - ArgsUsage: `[containerID] - -starts -`, -} - -func checkHackyPreStart(c *lxc.Container) string { - hooks := c.ConfigItem("lxc.hook.pre-start") - for _, h := range hooks { - if h == "/bin/true" { - return "started" - } - } - return "prestart" -} - -func setHackyPreStart(c *lxc.Container) { - err := c.SetConfigItem("lxc.hook.pre-start", "/bin/true") - if err != nil { - log.Warnf("Failed to set \"container started\" indicator: %v", err) - } - err = c.SaveConfigFile(filepath.Join(LXC_PATH, c.Name(), "config")) - if err != nil { - log.Warnf("Failed to save \"container started\" indicator: %v", err) - } -} - -func doStart(ctx *cli.Context) error { - containerID := ctx.Args().Get(0) - if len(containerID) == 0 { - fmt.Fprintf(os.Stderr, "missing container ID\n") - cli.ShowCommandHelpAndExit(ctx, "state", 1) - } - - log.Infof("about to create container") - c, err := lxc.NewContainer(containerID, LXC_PATH) - if err != nil { - return errors.Wrap(err, "failed to load container") - } - defer c.Release() - log.Infof("checking if running") - if !c.Running() { - return fmt.Errorf("'%s' is not ready", containerID) - } - if checkHackyPreStart(c) == "started" { - return fmt.Errorf("'%s' already running", containerID) - } - log.Infof("not running, can start") - setHackyPreStart(c) - fifoPath := filepath.Join(LXC_PATH, containerID, "syncfifo") - log.Infof("opening fifo '%s'", fifoPath) - f, err := os.OpenFile(fifoPath, os.O_RDWR, 0) - if err != nil { - return errors.Wrap(err, "failed to open sync fifo") - } - - log.Infof("opened fifo, reading") - data := make([]byte, len(SYNC_FIFO_CONTENT)) - n, err := f.Read(data) - if err != nil { - 
return errors.Wrapf(err, "problem reading from fifo") - } - if n != len(SYNC_FIFO_CONTENT) || string(data) != SYNC_FIFO_CONTENT { - return errors.Errorf("bad fifo content: %s", string(data)) - } - - log.Infof("read '%s' from fifo, done", data) - return nil -} diff --git a/cmd/state.go b/cmd/state.go deleted file mode 100644 index c98918e9..00000000 --- a/cmd/state.go +++ /dev/null @@ -1,115 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "strconv" - "strings" - - // "github.com/apex/log" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" - "github.com/urfave/cli" - - lxc "gopkg.in/lxc/go-lxc.v2" -) - -var stateCmd = cli.Command{ - Name: "state", - Usage: "returns state of a container", - Action: doState, - ArgsUsage: `[containerID] - - is the ID of the container you want to know about. -`, - Flags: []cli.Flag{}, -} - -func doState(ctx *cli.Context) error { - containerID := ctx.Args().Get(0) - if len(containerID) == 0 { - fmt.Fprintf(os.Stderr, "missing container ID\n") - cli.ShowCommandHelpAndExit(ctx, "state", 1) - } - - exists, err := containerExists(containerID) - if err != nil { - return errors.Wrap(err, "failed to check if container exists") - } - if !exists { - return fmt.Errorf("container '%s' not found", containerID) - } - - c, err := lxc.NewContainer(containerID, LXC_PATH) - if err != nil { - return errors.Wrap(err, "failed to load container") - } - defer c.Release() - - if err := configureLogging(ctx, c); err != nil { - return errors.Wrap(err, "failed to configure logging") - - } - - status := "stopped" - pid := 0 - if c.Running() { - if checkHackyPreStart(c) == "started" { - status = "running" - } - pid = c.InitPid() - - // need to detect 'created' per - // https://github.com/opencontainers/runtime-spec/blob/v1.0.0-rc4/runtime.md#state - // it means "the container process has neither exited nor executed the user-specified program" - - // if cmd name of the child of the init 
pid starts with "/bin/sh /fifo-wait" then we can say it's 'created' - - procChildrenFilename := fmt.Sprintf("/proc/%d/task/%d/children", pid, pid) - childrenStr, err := ioutil.ReadFile(procChildrenFilename) - if err != nil { - return errors.Wrapf(err, "failed to read children from %s", procChildrenFilename) - } - children := strings.Split(strings.TrimSpace(string(childrenStr)), " ") - - if len(children) == 1 { - childPid, err := strconv.Atoi(children[0]) - if err != nil { - return errors.Wrapf(err, "failed to convert child pid") - } - procCmdlineFilename := fmt.Sprintf("/proc/%d/cmdline", childPid) - cmdline, err := ioutil.ReadFile(procCmdlineFilename) - if err != nil { - return errors.Wrapf(err, "failed to read cmdline from %s", procCmdlineFilename) - } - - cmdArgv := strings.Split(string(cmdline), "\x00") - if len(cmdArgv) > 2 && cmdArgv[0] == "/bin/sh" && cmdArgv[1] == "/fifo-wait" { - status = "created" - } - } - } - // bundlePath is the enclosing directory of the rootfs: - // https://github.com/opencontainers/runtime-spec/blob/v1.0.0-rc4/bundle.md - bundlePath := filepath.Dir(c.ConfigItem("lxc.rootfs.path")[0]) - annotations := map[string]string{} - s := specs.State{ - Version: CURRENT_OCI_VERSION, - ID: containerID, - Status: status, - Pid: pid, - Bundle: bundlePath, - Annotations: annotations, - } - - stateJson, err := json.Marshal(s) - if err != nil { - return errors.Wrap(err, "failed to marshal json") - } - fmt.Fprint(os.Stdout, string(stateJson)) - - return nil -} diff --git a/cmd/utils.go b/cmd/utils.go deleted file mode 100644 index aa3a45bc..00000000 --- a/cmd/utils.go +++ /dev/null @@ -1,88 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "os" - "os/exec" - "path/filepath" - "strings" - - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" - "github.com/urfave/cli" - - lxc "gopkg.in/lxc/go-lxc.v2" -) - -func readBundleSpec(specFilePath string) (spec *specs.Spec, err error) { - specFile, err := os.Open(specFilePath) 
- if err != nil { - return nil, errors.Wrapf(err, "failed to open spec file '%s'", specFilePath) - } - defer specFile.Close() - err = json.NewDecoder(specFile).Decode(&spec) - if err != nil { - return nil, errors.Wrapf(err, "failed to decode spec file") - } - - return spec, nil -} - -func configureLogging(ctx *cli.Context, c *lxc.Container) error { - if ctx.GlobalIsSet("log-level") { - var logLevel lxc.LogLevel - switch ctx.GlobalString("log-level") { - case "trace": - logLevel = lxc.TRACE - case "debug": - logLevel = lxc.DEBUG - case "info": - logLevel = lxc.INFO - case "warn": - logLevel = lxc.WARN - case "", "error": - logLevel = lxc.ERROR - default: - return fmt.Errorf("lxc driver config 'log_level' can only be trace, debug, info, warn or error") - } - c.SetLogLevel(logLevel) - } - - if ctx.GlobalIsSet("log-file") { - c.SetLogFile(ctx.GlobalString("log-file")) - } - return nil -} - -func pathExists(path string) (bool, error) { - _, err := os.Stat(path) - if err == nil { - return true, nil - } - if os.IsNotExist(err) { - return false, nil - } - return true, err -} - -func containerExists(containerID string) (bool, error) { - // check for container existence by looking for config file. - // otherwise NewContainer will return an empty container - // struct and we'll report wrong info - configExists, err := pathExists(filepath.Join(LXC_PATH, containerID, "config")) - if err != nil { - return false, errors.Wrap(err, "failed to check path existence of config") - } - - return configExists, nil -} - -func RunCommand(args ...string) error { - cmd := exec.Command(args[0], args[1:]...) 
- output, err := cmd.CombinedOutput() - if err != nil { - return errors.Errorf("%s: %s: %s", strings.Join(args, " "), err, string(output)) - } - return nil -} diff --git a/container.go b/container.go new file mode 100644 index 00000000..aae65c52 --- /dev/null +++ b/container.go @@ -0,0 +1,475 @@ +package lxcri + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/rs/zerolog" + "golang.org/x/sys/unix" + "gopkg.in/lxc/go-lxc.v2" +) + +// ContainerConfig is the configuration for a single Container instance. +type ContainerConfig struct { + // The Spec used to generate the liblxc config file. + // Any changes to the spec after creating the liblxc config file have no effect + // and should be avoided. + // NOTE The Spec must be serialized with the runtime config (lxcri.json) + // This is required because Spec.Annotations are required for Container.State() + // and spec.Namespaces are required for attach. + Spec *specs.Spec + + // ContainerID is the identifier of the container. + // The ContainerID is used as name for the containers runtime directory. + // The ContainerID must be unique at least through all containers of a runtime. + // The ContainerID should match the following pattern `[a-z][a-z0-9-_]+` + ContainerID string + + BundlePath string + ConsoleSocket string `json:",omitempty"` + + // PidFile is the absolute PID file path + // for the container monitor process (ExecStart) + MonitorCgroupDir string + + CgroupDir string + + // LogFile is the liblxc log file path + LogFile string + // LogLevel is the liblxc log level + LogLevel string + + // Log is the container Logger + Log zerolog.Logger `json:"-"` +} + +// ConfigFilePath returns the path to the liblxc config file. 
+func (c Container) ConfigFilePath() string { + return c.RuntimePath("config") +} + +func (c Container) syncFifoPath() string { + return c.RuntimePath("syncfifo") +} + +// RuntimePath returns the absolute path to the given sub path +// within the container root. +func (c Container) RuntimePath(subPath ...string) string { + return filepath.Join(c.runtimeDir, filepath.Join(subPath...)) +} + +// Container is the runtime state of a container instance. +type Container struct { + LinuxContainer *lxc.Container `json:"-"` + *ContainerConfig + + CreatedAt time.Time + // Pid is the process ID of the liblxc monitor process ( see ExecStart ) + Pid int + + runtimeDir string +} + +func (c *Container) create() error { + if err := os.MkdirAll(c.runtimeDir, 0777); err != nil { + return fmt.Errorf("failed to create container dir: %w", err) + } + + if err := os.Chmod(c.runtimeDir, 0777); err != nil { + return errorf("failed to chmod %s: %w", c.runtimeDir, err) + } + + f, err := os.OpenFile(c.RuntimePath("config"), os.O_EXCL|os.O_CREATE|os.O_RDWR, 0640) + if err != nil { + return err + } + if err := f.Close(); err != nil { + return fmt.Errorf("failed to close empty config tmpfile: %w", err) + } + + c.LinuxContainer, err = lxc.NewContainer(c.ContainerID, filepath.Dir(c.runtimeDir)) + if err != nil { + return err + } + + return nil +} + +func (c *Container) load() error { + err := specki.DecodeJSONFile(c.RuntimePath("lxcri.json"), c) + if err != nil { + return fmt.Errorf("failed to load container config: %w", err) + } + + _, err = os.Stat(c.ConfigFilePath()) + if err != nil { + return fmt.Errorf("failed to load lxc config file: %w", err) + } + c.LinuxContainer, err = lxc.NewContainer(c.ContainerID, filepath.Dir(c.runtimeDir)) + if err != nil { + return fmt.Errorf("failed to create lxc container: %w", err) + } + + err = c.LinuxContainer.LoadConfigFile(c.ConfigFilePath()) + if err != nil { + return fmt.Errorf("failed to load config file: %w", err) + } + return nil +} + +func (c *Container) 
waitMonitorStopped(ctx context.Context) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + if !c.isMonitorRunning() { + return nil + } + time.Sleep(time.Millisecond * 100) + } + } +} + +func (c *Container) isMonitorRunning() bool { + if c.Pid < 2 { + return false + } + + var ws unix.WaitStatus + pid, err := unix.Wait4(c.Pid, &ws, unix.WNOHANG, nil) + if pid == c.Pid { + c.Log.Info().Msgf("monitor %d died: exited:%t exit_status:%d signaled:%t signal:%s", + c.Pid, ws.Exited(), ws.ExitStatus(), ws.Signaled(), ws.Signal()) + return false + } + + // if WNOHANG was specified and one or more child(ren) specified by pid exist, + // but have not yet changed state, then 0 is returned + if pid == 0 { + return true + } + + // This runtime process may not be the parent of the monitor process + if err == unix.ECHILD { + // check if the process is still runnning + err := unix.Kill(c.Pid, 0) + if err == nil { + return true + } + // it's not running + if err == unix.ESRCH { + return false + } + } + return false +} + +func (c *Container) waitCreated(ctx context.Context) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + if !c.isMonitorRunning() { + return fmt.Errorf("monitor already died") + } + state := c.LinuxContainer.State() + if !(state == lxc.RUNNING) { + c.Log.Debug().Stringer("state", state).Msg("wait for state lxc.RUNNING") + time.Sleep(time.Millisecond * 100) + continue + } + initState, err := c.getContainerInitState() + if err != nil { + return err + } + if initState == specs.StateCreated { + return nil + } + return fmt.Errorf("unexpected init state %q", initState) + } + } +} + +func (c *Container) waitStarted(ctx context.Context) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + if !c.isMonitorRunning() { + return fmt.Errorf("monitor already died") + } + initState, _ := c.getContainerInitState() + if initState != specs.StateCreated { + return nil + } + time.Sleep(time.Millisecond * 
10) + } + } +} + +// State wraps specs.State and adds runtime specific state. +type State struct { + ContainerState string + RuntimePath string + SpecState specs.State +} + +// State returns the runtime state of the containers process. +// The State.Pid value is the PID of the liblxc +// container monitor process (lxcri-start). +func (c *Container) State() (*State, error) { + status, err := c.ContainerState() + if err != nil { + return nil, errorf("failed to get container status: %w", err) + } + + state := &State{ + ContainerState: c.LinuxContainer.State().String(), + RuntimePath: c.RuntimePath(), + SpecState: specs.State{ + Version: c.Spec.Version, + ID: c.ContainerID, + Bundle: c.RuntimePath(), + Pid: c.Pid, + Annotations: c.Spec.Annotations, + Status: status, + }, + } + + return state, nil +} + +// ContainerState returns the current state of the container process, +// as defined by the OCI runtime spec. +func (c *Container) ContainerState() (specs.ContainerState, error) { + return c.state(c.LinuxContainer.State()) +} + +func (c *Container) state(s lxc.State) (specs.ContainerState, error) { + switch s { + case lxc.STOPPED: + return specs.StateStopped, nil + case lxc.STARTING: + return specs.StateCreating, nil + case lxc.RUNNING, lxc.STOPPING, lxc.ABORTING, lxc.FREEZING, lxc.FROZEN, lxc.THAWED: + return c.getContainerInitState() + default: + return specs.StateStopped, fmt.Errorf("unsupported lxc container state %q", s) + } +} + +// getContainerInitState returns the detailed state of the container init process. +// This should be called if the container is in state lxc.RUNNING. +// On error the caller should call getContainerState() again +func (c *Container) getContainerInitState() (specs.ContainerState, error) { + initPid := c.LinuxContainer.InitPid() + if initPid < 1 { + return specs.StateStopped, nil + } + cmdlinePath := fmt.Sprintf("/proc/%d/cmdline", initPid) + cmdline, err := os.ReadFile(cmdlinePath) + // Ignore any error here. 
Most likely the error will be os.ErrNotExist. + // But I've seen race conditions where ESRCH is returned instead because + // the process has died while opening its proc directory. + if err != nil { + if !(os.IsNotExist(err) || err == unix.ESRCH) { + c.Log.Warn().Str("file", cmdlinePath).Msgf("open failed: %s", err) + } + // init process died or returned + return specs.StateStopped, nil + } + if string(cmdline) == "/.lxcri/lxcri-init\000" { + return specs.StateCreated, nil + } + return specs.StateRunning, nil +} + +func (c *Container) kill(ctx context.Context, signum unix.Signal) error { + c.Log.Info().Int("signum", int(signum)).Msg("killing container process") + + // From `man pid_namespaces`: If the "init" process of a PID namespace terminates, the kernel + // terminates all of the processes in the namespace via a SIGKILL signal. + // NOTE: The liblxc monitor process `lxcri-start` doesn't propagate all signals to the init process, + // but handles some signals on its own. E.g SIGHUP tells the monitor process to hang up the terminal + // and terminate the init process with SIGTERM. + err := killCgroup(ctx, c, signum) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to kill group: %w", err) + } + return nil +} + +// GetConfigItem is a wrapper function and returns the +// first value returned by *lxc.Container.ConfigItem +func (c *Container) GetConfigItem(key string) string { + vals := c.LinuxContainer.ConfigItem(key) + if len(vals) > 0 { + first := vals[0] + // some lxc config values are set to '(null)' if unset eg. lxc.cgroup.dir + // TODO check if this is already fixed + if first != "(null)" { + return first + } + } + return "" +} + +// SetConfigItem is a wrapper for *lxc.Container.SetConfigItem. +// and only adds additional logging. 
+func (c *Container) SetConfigItem(key, value string) error {
+	if err := c.LinuxContainer.SetConfigItem(key, value); err != nil {
+		return fmt.Errorf("failed to set config item '%s=%s': %w", key, value, err)
+	}
+	// The config key doubles as the name of the log field.
+	c.Log.Debug().Str(key, value).Msg("set config item")
+	return nil
+}
+
+// SupportsConfigItem reports whether liblxc supports all of the given config keys.
+// It returns false at the first key that fails the check and logs that key.
+// With liblxc older than 4.0.6 the check is not trusted: a warning is logged
+// and any given key is reported as unsupported.
+func (c *Container) SupportsConfigItem(keys ...string) bool {
+	reliable := lxc.VersionAtLeast(4, 0, 6)
+	if !reliable {
+		c.Log.Warn().Msg("lxc.IsSupportedConfigItem is broken in liblxc < 4.0.6")
+	}
+	for _, key := range keys {
+		// lxc.IsSupportedConfigItem is only consulted when it can be trusted.
+		if !reliable || !lxc.IsSupportedConfigItem(key) {
+			c.Log.Info().Str("lxc.config", key).Msg("unsupported config item")
+			return false
+		}
+	}
+	return true
+}
+
+// Release calls Release on the underlying lxc container and returns its result.
+// It releases the resources allocated by the container.
+func (c *Container) Release() error {
+	err := c.LinuxContainer.Release()
+	return err
+}
+
+// start opens the sync fifo of the container for writing, closes it again
+// and then returns the result of waitStarted.
+func (c *Container) start(ctx context.Context) error {
+	// #nosec
+	syncFifo, err := os.OpenFile(c.syncFifoPath(), os.O_WRONLY, 0)
+	if err != nil {
+		return err
+	}
+	if closeErr := syncFifo.Close(); closeErr != nil {
+		return closeErr
+	}
+	return c.waitStarted(ctx)
+}
+
+// ExecDetached starts the process described by proc within the container
+// and returns its PID without waiting for it to exit.
+// Waiting for the process, using the returned PID, is left to the caller.
+// The container state must be either specs.StateCreated or specs.StateRunning
+func (c *Container) ExecDetached(proc *specs.Process) (pid int, err error) {
+	attach, err := attachOptions(proc, c.Spec.Linux.Namespaces)
+	if err != nil {
+		return 0, errorf("failed to create attach options: %w", err)
+	}
+
+	ev := c.Log.Info().Strs("args", proc.Args)
+	ev = ev.Int("uid", attach.UID).Int("gid", attach.GID)
+	ev.Ints("groups", attach.Groups).Msg("execute cmd")
+
+	if pid, err = c.LinuxContainer.RunCommandNoWait(proc.Args, attach); err != nil {
+		return pid, errorf("failed to run exec cmd detached: %w", err)
+	}
+	return pid, nil
+}
+
+// Exec runs the process described by proc within the container,
+// waits for it to exit and returns its exit code.
+// The container state must either be specs.StateCreated or specs.StateRunning
+func (c *Container) Exec(proc *specs.Process) (exitStatus int, err error) {
+	attach, err := attachOptions(proc, c.Spec.Linux.Namespaces)
+	if err != nil {
+		return 0, errorf("failed to create attach options: %w", err)
+	}
+	if exitStatus, err = c.LinuxContainer.RunCommandStatus(proc.Args, attach); err != nil {
+		return exitStatus, errorf("failed to run exec cmd: %w", err)
+	}
+	return exitStatus, nil
+}
+
+// attachOptions builds the lxc attach options for running procSpec in the given namespaces.
+// Standard I/O is wired to file descriptors 0, 1 and 2.
+// If procSpec is nil only the standard I/O and the namespace flags are set.
+func attachOptions(procSpec *specs.Process, ns []specs.LinuxNamespace) (lxc.AttachOptions, error) {
+	opts := lxc.AttachOptions{StdinFd: 0, StdoutFd: 1, StderrFd: 2}
+
+	flags, err := cloneFlags(ns)
+	if err != nil {
+		return opts, err
+	}
+	opts.Namespaces = flags
+
+	if procSpec == nil {
+		return opts, nil
+	}
+
+	opts.Cwd = procSpec.Cwd
+	// The process runs with exactly the environment from the process spec.
+	opts.ClearEnv = true
+	opts.Env = procSpec.Env
+
+	user := procSpec.User
+	opts.UID = int(user.UID)
+	opts.GID = int(user.GID)
+	// Groups stays nil when the process spec lists no additional group IDs.
+	if len(user.AdditionalGids) > 0 {
+		groups := make([]int, 0, len(user.AdditionalGids))
+		for _, gid := range user.AdditionalGids {
+			groups = append(groups, int(gid))
+		}
+		opts.Groups = groups
+	}
+	return opts, nil
+}
+
+// setLog applies the log settings of the container (verbosity, log level
+// and log file) to the underlying lxc container.
+func setLog(c *Container) error {
+	// stdout belongs to the container init process, lxc must never write to it.
+	// Quiet is currently the default, but it is set explicitly anyway.
+	c.LinuxContainer.SetVerbosity(lxc.Quiet)
+	// The log level of a running container is set on every runtime call,
+	// so it may change between calls.
+	level := parseContainerLogLevel(c.LogLevel)
+	if err := c.LinuxContainer.SetLogLevel(level); err != nil {
+		return fmt.Errorf("failed to set container loglevel: %w", err)
+	}
+	if err := c.LinuxContainer.SetLogFile(c.LogFile); err != nil {
+		return fmt.Errorf("failed to set container log file: %w", err)
+	}
+	return nil
+}
+
+// parseContainerLogLevel maps a log level name to the lxc log level.
+// The name is matched case-insensitively, unknown names map to lxc.WARN.
+func parseContainerLogLevel(level string) lxc.LogLevel {
+	levels := map[string]lxc.LogLevel{
+		"trace":  lxc.TRACE,
+		"debug":  lxc.DEBUG,
+		"info":   lxc.INFO,
+		"notice": lxc.NOTICE,
+		"warn":   lxc.WARN,
+		"error":  lxc.ERROR,
+		"crit":   lxc.CRIT,
+		"alert":  lxc.ALERT,
+		"fatal":  lxc.FATAL,
+	}
+	if lvl, known := levels[strings.ToLower(level)]; known {
+		return lvl
+	}
+	return lxc.WARN
+}
diff --git a/create.go b/create.go
new file mode 100644
index 00000000..51aa546c
--- /dev/null
+++ b/create.go
@@ -0,0 +1,403 @@
+package lxcri
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/drachenfels-de/lxcri/pkg/specki"
+	"github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// Create creates a single container instance from the given ContainerConfig.
+// Create is the first runtime method to call within the lifecycle of a container.
+// A created Container must be released with Container.Release after use.
+// You should call Runtime.Delete to cleanup container runtime state, even
+// if the Create returned with an error.
+func (rt *Runtime) Create(ctx context.Context, cfg *ContainerConfig) (*Container, error) { + if err := rt.checkConfig(cfg); err != nil { + return nil, err + } + + c := &Container{ContainerConfig: cfg} + c.runtimeDir = filepath.Join(rt.Root, c.ContainerID) + + if cfg.Spec.Annotations == nil { + cfg.Spec.Annotations = make(map[string]string) + } + cfg.Spec.Annotations["org.linuxcontainers.lxc.ConfigFile"] = c.RuntimePath("config") + + if err := c.create(); err != nil { + return c, errorf("failed to create container: %w", err) + } + + if err := configureContainer(rt, c); err != nil { + return c, errorf("failed to configure container: %w", err) + } + + cleanenv(c, true) + + // Seralize the modified spec.Spec separately, to make it available for + // runtime hooks. + specPath := c.RuntimePath(BundleConfigFile) + err := specki.EncodeJSONFile(specPath, cfg.Spec, os.O_EXCL|os.O_CREATE, 0444) + if err != nil { + return c, err + } + + err = specki.EncodeJSONFile(c.RuntimePath("hooks.json"), cfg.Spec.Hooks, os.O_EXCL|os.O_CREATE, 0444) + if err != nil { + return c, err + } + state, err := c.State() + if err != nil { + return c, err + } + err = specki.EncodeJSONFile(c.RuntimePath("state.json"), state.SpecState, os.O_EXCL|os.O_CREATE, 0444) + if err != nil { + return c, err + } + + if err := rt.runStartCmd(ctx, c); err != nil { + return c, errorf("failed to run container process: %w", err) + } + return c, nil +} + +func configureContainer(rt *Runtime, c *Container) error { + if err := setLog(c); err != nil { + return errorf("failed to configure container log: %w", err) + } + + if err := configureHostname(rt, c); err != nil { + return err + } + + if err := configureRootfs(rt, c); err != nil { + return fmt.Errorf("failed to configure rootfs: %w", err) + } + + if err := configureInit(rt, c); err != nil { + return fmt.Errorf("failed to configure init: %w", err) + } + + if os.Getuid() != 0 { + // ensure user namespace is enabled + if !isNamespaceEnabled(c.Spec, specs.UserNamespace) 
{ + rt.Log.Warn().Msg("unprivileged runtime - enabling user namespace") + c.Spec.Linux.Namespaces = append(c.Spec.Linux.Namespaces, + specs.LinuxNamespace{Type: specs.UserNamespace}, + ) + } + } + if err := configureNamespaces(c); err != nil { + return fmt.Errorf("failed to configure namespaces: %w", err) + } + + if c.Spec.Process.OOMScoreAdj != nil { + if err := c.SetConfigItem("lxc.proc.oom_score_adj", fmt.Sprintf("%d", *c.Spec.Process.OOMScoreAdj)); err != nil { + return err + } + } + + if c.Spec.Process.NoNewPrivileges { + if err := c.SetConfigItem("lxc.no_new_privs", "1"); err != nil { + return err + } + } + + if rt.Features.Apparmor { + if err := configureApparmor(c); err != nil { + return fmt.Errorf("failed to configure apparmor: %w", err) + } + } else { + rt.Log.Warn().Msg("apparmor feature is disabled - profile is set to unconfined") + } + + if rt.Features.Seccomp { + if c.Spec.Linux.Seccomp != nil && len(c.Spec.Linux.Seccomp.Syscalls) > 0 { + profilePath := c.RuntimePath("seccomp.conf") + if err := writeSeccompProfile(profilePath, c.Spec.Linux.Seccomp); err != nil { + return err + } + if err := c.SetConfigItem("lxc.seccomp.profile", profilePath); err != nil { + return err + } + } + } else { + rt.Log.Warn().Msg("seccomp feature is disabled - all system calls are allowed") + } + + if rt.Features.Capabilities { + if err := configureCapabilities(c); err != nil { + return fmt.Errorf("failed to configure capabilities: %w", err) + } + } else { + rt.Log.Warn().Msg("capabilities feature is disabled - running with runtime privileges") + } + + // make sure autodev is disabled + if err := c.SetConfigItem("lxc.autodev", "0"); err != nil { + return err + } + + // NOTE crio can add devices (through the config) but this does not work for privileged containers. 
+ // See https://github.com/cri-o/cri-o/blob/a705db4c6d04d7c14a4d59170a0ebb4b30850675/server/container_create_linux.go#L45 + // File an issue on cri-o (at least for support) + if err := specki.AllowEssentialDevices(c.Spec); err != nil { + return err + } + + if !rt.hasCapability("mknod") { + rt.Log.Info().Msg("runtime does not have capability CAP_MKNOD") + // CAP_MKNOD is not granted `man capabilities` + // Bind mount devices instead. + newMounts := make([]specs.Mount, 0, len(c.Spec.Mounts)+len(c.Spec.Linux.Devices)) + for _, m := range c.Spec.Mounts { + if m.Destination == "/dev" { + rt.Log.Info().Msg("removing old /dev mount") + continue + } + newMounts = append(newMounts, m) + } + c.Spec.Mounts = append(c.Spec.Mounts, + specs.Mount{ + Destination: "/dev", Source: "tmpfs", Type: "tmpfs", + Options: []string{"rw", "nosuid", "noexec", "relatime"}, + }, + ) + rt.Log.Info().Msg("bind mount devices") + for _, device := range c.Spec.Linux.Devices { + newMounts = append(newMounts, + specs.Mount{ + Destination: device.Path, Source: device.Path, Type: "bind", + Options: []string{"bind", "create=file"}, + }, + ) + } + c.Spec.Mounts = newMounts + c.Spec.Linux.Devices = nil + } + + if err := configureHooks(rt, c); err != nil { + return err + } + + if err := configureCgroup(rt, c); err != nil { + return fmt.Errorf("failed to configure cgroups: %w", err) + } + + for key, val := range c.Spec.Linux.Sysctl { + if err := c.SetConfigItem("lxc.sysctl."+key, val); err != nil { + return err + } + } + + // `man lxc.container.conf`: "A resource with no explicitly configured limitation will be inherited + // from the process starting up the container" + seenLimits := make([]string, 0, len(c.Spec.Process.Rlimits)) + for _, limit := range c.Spec.Process.Rlimits { + name := strings.TrimPrefix(strings.ToLower(limit.Type), "rlimit_") + for _, seen := range seenLimits { + if seen == name { + return fmt.Errorf("duplicate resource limit %q", limit.Type) + } + } + seenLimits = append(seenLimits, 
name) + val := fmt.Sprintf("%d:%d", limit.Soft, limit.Hard) + if err := c.SetConfigItem("lxc.prlimit."+name, val); err != nil { + return err + } + } + + if err := configureMounts(rt, c); err != nil { + return fmt.Errorf("failed to configure mounts: %w", err) + } + + if err := configureReadonlyPaths(c); err != nil { + return fmt.Errorf("failed to configure read-only paths: %w", err) + } + return nil +} + +func configureHostname(rt *Runtime, c *Container) error { + if c.Spec.Hostname == "" { + return nil + } + if err := c.SetConfigItem("lxc.uts.name", c.Spec.Hostname); err != nil { + return err + } + + // Check if UTS namespace is shared, but not with the host. + uts := getNamespace(c.Spec, specs.UTSNamespace) + if uts == nil { + return nil + } + + yes, err := isNamespaceSharedWithRuntime(uts) + if err != nil { + return errorf("failed to check if uts namespace is shared with host: %w", err) + } + if yes { + return nil + } + + // Set the hostname on shared UTS namespace, since liblxc doesn't do it. + if err := setHostname(uts.Path, c.Spec.Hostname); err != nil { + return fmt.Errorf("failed to set hostname: %w", err) + } + return nil +} + +func configureRootfs(rt *Runtime, c *Container) error { + rootfs := c.Spec.Root.Path + if !filepath.IsAbs(rootfs) { + rootfs = filepath.Join(c.BundlePath, rootfs) + } + if err := c.SetConfigItem("lxc.rootfs.path", rootfs); err != nil { + return err + } + + if err := c.SetConfigItem("lxc.rootfs.mount", rootfs); err != nil { + return err + } + + if err := c.SetConfigItem("lxc.rootfs.managed", "0"); err != nil { + return err + } + + // Resources not created by the container runtime MUST NOT be deleted by it. 
+ if err := c.SetConfigItem("lxc.ephemeral", "0"); err != nil { + return err + } + + rootfsOptions := []string{} + if c.Spec.Linux.RootfsPropagation != "" { + rootfsOptions = append(rootfsOptions, c.Spec.Linux.RootfsPropagation) + } + if c.Spec.Root.Readonly { + rootfsOptions = append(rootfsOptions, "ro") + } + if err := c.SetConfigItem("lxc.rootfs.options", strings.Join(rootfsOptions, ",")); err != nil { + return err + } + return nil +} + +func configureReadonlyPaths(c *Container) error { + rootmnt := c.GetConfigItem("lxc.rootfs.mount") + if rootmnt == "" { + return fmt.Errorf("lxc.rootfs.mount unavailable") + } + for _, p := range c.Spec.Linux.ReadonlyPaths { + mnt := fmt.Sprintf("%s %s %s %s", filepath.Join(rootmnt, p), strings.TrimPrefix(p, "/"), "bind", "bind,ro,optional") + if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { + return fmt.Errorf("failed to make path readonly: %w", err) + } + } + return nil +} + +func configureApparmor(c *Container) error { + // The value *apparmor_profile* from crio.conf is used if no profile is defined by the container. + aaprofile := c.Spec.Process.ApparmorProfile + if aaprofile == "" { + aaprofile = "unconfined" + } + return c.SetConfigItem("lxc.apparmor.profile", aaprofile) +} + +// configureCapabilities configures the linux capabilities / privileges granted to the container processes. +// See `man lxc.container.conf` lxc.cap.drop and lxc.cap.keep for details. 
+// https://blog.container-solutions.com/linux-capabilities-in-practice +// https://blog.container-solutions.com/linux-capabilities-why-they-exist-and-how-they-work +func configureCapabilities(c *Container) error { + keepCaps := "none" + if c.Spec.Process.Capabilities != nil { + var caps []string + for _, c := range c.Spec.Process.Capabilities.Permitted { + lcCapName := strings.TrimPrefix(strings.ToLower(c), "cap_") + caps = append(caps, lcCapName) + } + if len(caps) > 0 { + keepCaps = strings.Join(caps, " ") + } + } + + return c.SetConfigItem("lxc.cap.keep", keepCaps) +} + +// NOTE keep in sync with cmd/lxcri-hook#ociHooksAndState +func configureHooks(rt *Runtime, c *Container) error { + + // prepend runtime OCI hooks to container hooks + hooks := rt.Hooks + + if c.Spec.Hooks != nil { + if len(c.Spec.Hooks.Prestart) > 0 { + hooks.Prestart = append(hooks.Prestart, c.Spec.Hooks.Prestart...) + } + if len(c.Spec.Hooks.CreateRuntime) > 0 { + hooks.CreateRuntime = append(hooks.CreateRuntime, c.Spec.Hooks.CreateRuntime...) + } + if len(c.Spec.Hooks.CreateContainer) > 0 { + hooks.CreateContainer = append(hooks.CreateContainer, c.Spec.Hooks.CreateContainer...) + } + if len(c.Spec.Hooks.StartContainer) > 0 { + hooks.StartContainer = append(hooks.StartContainer, c.Spec.Hooks.StartContainer...) + } + if len(c.Spec.Hooks.Poststart) > 0 { + hooks.Poststart = append(hooks.Poststart, c.Spec.Hooks.Poststart...) + } + if len(c.Spec.Hooks.Poststop) > 0 { + hooks.Poststop = append(hooks.Poststop, c.Spec.Hooks.Poststop...) 
+ } + } + + c.Spec.Hooks = &hooks + + // pass context information as environment variables to hook scripts + if err := c.SetConfigItem("lxc.hook.version", "1"); err != nil { + return err + } + + if len(c.Spec.Hooks.Prestart) > 0 || len(c.Spec.Hooks.CreateRuntime) > 0 { + if err := c.SetConfigItem("lxc.hook.pre-mount", rt.libexec(ExecHook)); err != nil { + return err + } + } + if len(c.Spec.Hooks.CreateContainer) > 0 { + if err := c.SetConfigItem("lxc.hook.mount", rt.libexec(ExecHook)); err != nil { + return err + } + } + if len(c.Spec.Hooks.StartContainer) > 0 { + if err := c.SetConfigItem("lxc.hook.start", rt.libexec(ExecHook)); err != nil { + return err + } + } + return nil +} + +// cleanenv removes duplicates from spec.Process.Env. +// If overwrite is false the first defined value takes precedence, +// if overwrite is true, the last defined value overwrites previously +// defined values. +func cleanenv(c *Container, overwrite bool) { + env := c.Spec.Process.Env + if len(env) < 2 { + return + } + newEnv := make([]string, 0, len(env)) + var exist bool + for _, kv := range env { + newEnv, exist = specki.Setenv(newEnv, kv, overwrite) + if exist { + vals := strings.Split(kv, "=") + c.Log.Warn().Msgf("duplicate environment variable %s (overwrite=%t)", vals[0], overwrite) + } + } + c.Spec.Process.Env = newEnv +} diff --git a/doc/install.md b/doc/install.md new file mode 100644 index 00000000..c978bd34 --- /dev/null +++ b/doc/install.md @@ -0,0 +1,149 @@ +## cgroups + +Enable cgroupv2 unified hierarchy manually: + +``` +mount -t cgroup2 none /sys/fs/cgroup +``` + +or permanent via kernel cmdline params: + + ``` + systemd.unified_cgroup_hierarchy=1 cgroup_no_v1=all + ``` + +## build dependencies + +Install the build dependencies which are required to build the runtime and runtime dependencies. 
+ +### debian + +```sh +# liblxc / conmon build dependencies +apt-get install build-essential libtool automake pkg-config \ +libseccomp-dev libapparmor-dev libbtrfs-dev \ +libdevmapper-dev libcap-dev libc6-dev libglib2.0-dev +# k8s dependencies, tools +apt-get install jq ebtables iptables conntrack +``` + +### arch linux + +```sh +# liblxc / conmon build dependencies +pacman -Sy base-devel apparmor libseccomp libpcap btrfs-progs +# k8s dependencies +pacman -Sy conntrack-tools ebtables jq +``` + +## runtime dependencies + +* [lxc](https://github.com/lxc/lxc.git) >= b5daeddc5afce1cad4915aef3e71fdfe0f428709 +* [conmon/pinns](https://github.com/containers/conmon.git) v2.0.22 +* [cri-o](https://github.com/cri-o/cri-o.git) release-1.20 + +By default everything is installed to `/usr/local` + +### lxc (liblxc) + +```sh +git clone https://github.com/lxc/lxc.git +cd lxc +./autogen.sh +./configure --enable-bash=no --enable-seccomp=yes \ + --enable-capabilities=yes --enable-apparmor=yes +make install + +git describe --tags > /usr/local/lib/liblxc.version.txt +echo /usr/local/lib > /etc/ld.so.conf.d/local.conf +ldconfig +``` + +### lxcri + +``` +make install +``` + +The installation prefix environment variable is set to `PREFIX=/usr/local` by default.
+The library search path for `pkg-config` is set to `$PREFIX/lib/pkgconfig` by default.
+You can change that by setting the `PKG_CONFIG_PATH` environment variable.
+ +E.g. to install binaries in `/opt/bin` but use liblxc from `/usr/lib`: + + PREFIX=/opt PKG_CONFIG_PATH=/usr/lib/pkgconfig make install + +Keep in mind that you then have to change `PREFIX` in the cri-o install script below as well. + +### conmon + +```sh +git clone https://github.com/containers/conmon.git +cd conmon +git reset --hard v2.0.22 +make clean +make install +``` + +### cri-o + +```sh +#!/bin/sh +git clone https://github.com/cri-o/cri-o.git +cd cri-o +git reset --hard origin/release-1.20 +make install + +PREFIX=/usr/local +CRIO_LXC_ROOT=/run/lxcri + +# environment for `crio config` +export CONTAINER_CONMON=${PREFIX}/bin/conmon +export CONTAINER_PINNS_PATH=${PREFIX}/bin/pinns +export CONTAINER_DEFAULT_RUNTIME=lxcri +export CONTAINER_RUNTIMES=lxcri:${PREFIX}/bin/lxcri:$CRIO_LXC_ROOT + +crio config > /etc/crio/crio.conf +``` + +#### cgroupv2 ebpf + +Modify systemd service file to run with full privileges.
+This is required for the runtime to set cgroupv2 device controller eBPF.
+See https://github.com/cri-o/cri-o/pull/4272 + +``` +sed -i 's/ExecStart=\//ExecStart=+\//' /usr/local/lib/systemd/system/crio.service +systemctl daemon-reload +systemctl start crio +``` + +#### storage configuration + +If you're using `overlay` as storage driver cri-o may complain that it is not using `native diff` mode.
+Update `/etc/containers/storage.conf` to fix this. + +``` +# see https://github.com/containers/storage/blob/v1.20.2/docs/containers-storage.conf.5.md +[storage] +driver = "overlay" + +[storage.options.overlay] +# see https://www.kernel.org/doc/Documentation/filesystems/overlayfs.txt, `modinfo overlay` +# [ 8270.526807] overlayfs: conflicting options: metacopy=on,redirect_dir=off +# NOTE: metacopy can only be enabled when redirect_dir is enabled +# NOTE: storage driver name must be set or mountopt are not evaluated, +# even when the driver is the default driver --> BUG ? +mountopt = "nodev,redirect_dir=off,metacopy=off" +``` + +#### HTTP proxy + +If you need an HTTP proxy for internet access you may have to set the proxy environment variables in `/etc/default/crio` +for cri-o to be able to fetch images from remote repositories. + +``` +http_proxy="http://myproxy:3128" +https_proxy="http://myproxy:3128" +no_proxy="10.0.0.0/8,172.16.0.0/12,192.168.0.0/16,127.0.0.0/8,127.0.0.1,localhost" +``` diff --git a/doc/kubernetes.md b/doc/kubernetes.md new file mode 100644 index 00000000..717cacec --- /dev/null +++ b/doc/kubernetes.md @@ -0,0 +1,157 @@ +## kubernetes + +The following script downloads kubernetes [v1.20.2](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.20.md#v1202) and installs it to `/usr/local/bin`.
+You have to create the `kubelet.service` and `10-kubeadm.conf` before running the script. + +```sh +#!/bin/sh +# about: installs kubeadm,kubectl and kubelet to /usr/local/bin +# installs systemd service to /etc/systemd/system + + +# Upgrade process: +# * change RELEASE and CHECKSUM +# * remove downloaded archive file +# * run this script again + +ARCH="linux-amd64" +RELEASE="1.20.2" +ARCHIVE=kubernetes-server-$ARCH.tar.gz +CHECKSUM="65abf178782e43bc21e8455ffbfdadf6064dbeae3ff704ccf9e13e8acee18235c280b06778e5de4bd702f5507e1870fe38c561366d125ef4f821ed7aa46e9f45" +DESTDIR="/usr/local/bin" + +[ -e "$ARCHIVE" ] || wget https://dl.k8s.io/v$RELEASE/$ARCHIVE + +echo "$CHECKSUM $ARCHIVE" | sha512sum -c || exit 1 + +tar -x -z -f $ARCHIVE -C $DESTDIR --strip-components=3 kubernetes/server/bin/kubectl kubernetes/server/bin/kubeadm kubernetes/server/bin/kubelet +install -v kubelet.service /etc/systemd/system/ +install -v -D 10-kubeadm.conf /etc/systemd/system/kubelet.service.d/10-kubeadm.conf +systemctl daemon-reload +``` + +### systemd service + +**kubelet.service** +``` +[Unit] +Description=kubelet: The Kubernetes Node Agent +Documentation=http://kubernetes.io/docs/ + +[Service] +ExecStart=/usr/local/bin/kubelet +Restart=always +StartLimitInterval=0 +RestartSec=10 + +[Install] +WantedBy=multi-user.target +``` + +**10-kubeadm.conf** +``` +# Note: This dropin only works with kubeadm and kubelet v1.11+ +[Service] +Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf" +Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml" +# This is a file that "kubeadm init" and "kubeadm join" generate at runtime, populating the KUBELET_KUBEADM_ARGS variable dynamically +EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env +# This is a file that the user can use for overrides of the kubelet args as a last resort. 
Preferably, the user should use +# the .NodeRegistration.KubeletExtraArgs object in the configuration files instead. KUBELET_EXTRA_ARGS should be sourced from this file. +EnvironmentFile=-/etc/default/kubelet +ExecStart= +ExecStart=/usr/local/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS +``` + +### kubeadm init + +This initializes the kubernetes control-plane. + +* Replace `HOSTIP` and `HOSTNAME` variables in `cluster-init.yaml` and initialize the cluster: + +``` +kubeadm init --config cluster-init.yaml -v 5 +# for single node cluster remove taint +kubectl taint nodes --all node-role.kubernetes.io/master- +``` + + * Install a networking plugin (I'm using [calico](https://www.projectcalico.org)) + +**cluster-init.yaml** +```yaml +apiVersion: kubeadm.k8s.io/v1beta2 +kind: InitConfiguration +localAPIEndpoint: + advertiseAddress: {HOSTIP} + bindPort: 6443 +nodeRegistration: + name: {HOSTNAME} + criSocket: unix://var/run/crio/crio.sock + taints: + - effect: NoSchedule + key: node-role.kubernetes.io/master +# kubeletExtraArgs: +# v: "5" +--- +apiVersion: kubelet.config.k8s.io/v1beta1 +kind: KubeletConfiguration +cgroupDriver: systemd +--- +kind: ClusterConfiguration +kubernetesVersion: v1.20.2 +apiVersion: kubeadm.k8s.io/v1beta2 +apiServer: + timeoutForControlPlane: 4m0s +certificatesDir: /etc/kubernetes/pki +clusterName: kubernetes +controllerManager: {} +dns: + type: CoreDNS +etcd: + local: + dataDir: /var/lib/etcd +imageRepository: k8s.gcr.io +networking: + dnsDomain: cluster.local + serviceSubnet: 10.96.0.0/12 + podSubnet: 10.66.0.0/16 +scheduler: {} +controlPlaneEndpoint: "{HOSTIP}:6443" +``` + +#### preflight issues + +There are some `preflight` checks that might fail once you start kubeadm.
+ +##### install cri-tools + +``` +[ERROR FileExisting-crictl]: crictl not found in system path +``` + +Please install the `cri-tools` from https://github.com/kubernetes-sigs/cri-tools/releases to your `PATH` e.g [cri-tools v1.20.0](https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.20.0/crictl-v1.20.0-linux-amd64.tar.gz) + + +##### load br-netfilter + +``` +[ERROR FileContent--proc-sys-net-bridge-bridge-nf-call-iptables]: /proc/sys/net/bridge/bridge-nf-call-iptables does not exist +``` + +You must load the `br-netfilter` kernel module. To do that automatically on startup add it to `/etc/modules-load.d` e.g : + +``` +echo 'br-netfilter' > /etc/modules-load.d/kubelet.conf +``` + +##### enable IP forwarding +``` +[ERROR FileContent--proc-sys-net-ipv4-ip_forward]: /proc/sys/net/ipv4/ip_forward contents are not set to 1 +```` + +IP forwarding must be enabled. E.g + +``` +echo 'net.ipv4.ip_forward=1' > /etc/sysctl.d/99-kubelet.conf +sysctl --system +``` diff --git a/doc/setup.md b/doc/setup.md new file mode 100644 index 00000000..b6a4b3f9 --- /dev/null +++ b/doc/setup.md @@ -0,0 +1,111 @@ +## Glossary + +* `runtime` the lxcri binary and the command set that implement the [OCI runtime spec](https://github.com/opencontainers/runtime-spec/releases/download/v1.0.2/oci-runtime-spec-v1.0.2.html) +* `container process` the process that starts and runs the container using liblxc (lxcri-start) +* `container config` the LXC config file +* `bundle config` the lxcri container state (bundle path, pidfile ....) +* `runtime spec` the OCI runtime spec from the bundle + +## Setup + +The runtime binary implements flags that are required by the `OCI runtime spec`,
+and flags that are runtime specific (timeouts, hooks, logging ...). + +Most of the runtime specific flags have corresponding environment variables. See `lxcri --help`.
+The runtime evaluates the flag value in the following order (a lower number takes precedence). + +1. cmdline flag from process arguments (overwrites process environment) +2. process environment variable (overwrites environment file) +3. environment file (overwrites cmdline flag default) +4. cmdline flag default + +### Environment variables + +Currently you have to compile the environment file yourself.
+To list all available variables: + +``` +grep EnvVars cmd/cli.go | grep -o LXCRI_[A-Za-z_]* | xargs -n1 -I'{}' echo "#{}=" +``` + +### Environment file + +The default path to the environment file is `/etc/default/lxcri`.
+It is loaded on every start of the `lxcri` binary, so changes take immediate effect.
+Empty lines and those commented with a leading *#* are ignored.
+A malformed environment file will cause the next runtime call to fail.
+In production it's recommended that you replace the environment file atomically.
+ +E.g the environment file `/etc/default/lxcri` could look like this: + +```sh +LXCRI_LOG_LEVEL=debug +LXCRI_CONTAINER_LOG_LEVEL=debug +#LXCRI_LOG_FILE= +#LXCRI_LOG_TIMESTAMP= +#LXCRI_MONITOR_CGROUP= +#LXCRI_LIBEXEC= +#LXCRI_APPARMOR= +#LXCRI_CAPABILITIES= +#LXCRI_CGROUP_DEVICES= +#LXCRI_SECCOMP= +#LXCRI_CREATE_TIMEOUT= +#LXCRI_CREATE_HOOK=/usr/local/bin/lxcri-backup.sh +#LXCRI_CREATE_HOOK_TIMEOUT= +#LXCRI_START_TIMEOUT= +#LXCRI_KILL_TIMEOUT= +#LXCRI_DELETE_TIMEOUT= +``` + +### Runtime (security) features + +All supported runtime security features are enabled by default.
+The following runtime (security) features can optionally be disabled.
+For details see `lxcri --help`. + +* apparmor +* capabilities +* cgroup-devices +* seccomp + +### Logging + +There is only a single log file for runtime and container process log output.
+The log-level for the runtime and the container process can be set independently. + +* containers are ephemeral, but the log file should not be +* a single logfile is easy to rotate and monitor +* a single logfile is easy to tail (watch for errors / events ...) +* robust implementation is easy + +#### Log Filtering + +Runtime log lines are written in JSON using [zerolog](https://github.com/rs/zerolog).
+The log file can be easily filtered with [jq](https://stedolan.github.io/jq/).
+For filtering with `jq` you must strip the container process logs with `grep -v '^lxc'`
+ +E.g. to show only errors and warnings for the runtime `create` command: + +```sh + grep -v '^lxc ' /var/log/lxcri.log |\ + jq -c 'select(.cmd == "create" and (.l == "error" or .l == "warn"))' +``` + +#### Runtime log fields + +Fields that are always present: + +* `l` log level +* `m` log message +* `c` caller (source file and line number) +* `cid` container ID +* `cmd` runtime command +* `t` timestamp in UTC (format matches container process output) + +### Debugging + +Apart from the log file, the following resources are useful: + +* Systemd journal for cri-o and kubelet services +* `coredumpctl` if runtime or container process segfaults. diff --git a/go.mod b/go.mod index 7f80ccc2..4b5a8199 100644 --- a/go.mod +++ b/go.mod @@ -1,25 +1,22 @@ -module github.com/lxc/crio-lxc +module github.com/drachenfels-de/lxcri require ( - github.com/apex/log v1.1.1 - github.com/aws/aws-sdk-go v1.20.8 // indirect - github.com/golang/protobuf v1.3.1 // indirect - github.com/onsi/ginkgo v1.8.0 // indirect - github.com/opencontainers/runtime-spec v1.0.1 - github.com/pkg/errors v0.8.1 - github.com/stretchr/objx v0.2.0 // indirect - github.com/urfave/cli v1.20.0 - golang.org/x/crypto v0.0.0-20190621222207-cc06ce4a13d4 // indirect - golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0 - golang.org/x/text v0.3.2 // indirect - golang.org/x/tools v0.0.0-20190625160430-252024b82959 // indirect - gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect - gopkg.in/lxc/go-lxc.v2 v2.0.0-20190625173123-f4822c6bba64 - gopkg.in/yaml.v2 v2.2.2 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.0 // indirect + github.com/creack/pty v1.1.11 + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/drachenfels-de/gocapability v0.0.0-20210413092208-755d79b01352 + github.com/kr/pretty v0.2.1 // indirect + github.com/opencontainers/runtime-spec v1.0.3-0.20200929063507-e6143ca7d51d + github.com/rs/zerolog v1.20.0 + github.com/stretchr/testify v1.6.1 + github.com/urfave/cli/v2 v2.3.0 +
golang.org/x/sys v0.0.0-20210228012217-479acdf4ea46 + gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect + gopkg.in/lxc/go-lxc.v2 v2.0.0-20210205143421-c4b883be4881 ) -replace github.com/vbatts/go-mtree v0.4.4 => github.com/vbatts/go-mtree v0.4.5-0.20190122034725-8b6de6073c1a +replace golang.org/x/crypto => golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad -replace github.com/openSUSE/umoci v0.4.4 => github.com/tych0/umoci v0.1.1-0.20190402232331-556620754fb1 +replace golang.org/x/text => golang.org/x/text v0.3.3 -go 1.13 +go 1.16 diff --git a/go.sum b/go.sum index cc4071c6..a0aa7537 100644 --- a/go.sum +++ b/go.sum @@ -1,77 +1,55 @@ -github.com/apex/log v1.1.1 h1:BwhRZ0qbjYtTob0I+2M+smavV0kOC8XgcnGZcyL9liA= -github.com/apex/log v1.1.1/go.mod h1:Ls949n1HFtXfbDcjiTTFQqkVUrte0puoIBfO3SVgwOA= -github.com/aphistic/golf v0.0.0-20180712155816-02c07f170c5a/go.mod h1:3NqKYiepwy8kCu4PNA+aP7WUV72eXWJeP9/r3/K9aLE= -github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3stzu0Xys= -github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= -github.com/aws/aws-sdk-go v1.20.8/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= -github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/cpuguy83/go-md2man/v2 v2.0.0 h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM= +github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/creack/pty v1.1.11 h1:07n33Z8lZxZ2qwegKbObQohDhXDQxiMMz1NOUGYlesw= +github.com/creack/pty v1.1.11/go.mod 
h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= -github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= -github.com/jpillora/backoff v0.0.0-20180909062703-3050d21c67d7/go.mod h1:2iMrUgbbvHEiQClaW2NsSzMyGHqN+rDFqY705q49KG0= -github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= -github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= -github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= -github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= -github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.8.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/gomega v1.5.0/go.mod 
h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= -github.com/opencontainers/runtime-spec v1.0.1 h1:wY4pOY8fBdSIvs9+IDHC55thBuEulhzfSgKeC1yFvzQ= -github.com/opencontainers/runtime-spec v1.0.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= -github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= +github.com/drachenfels-de/gocapability v0.0.0-20210413092208-755d79b01352 h1:Qx+y7zFy52uzSTCYC3gUGHdbXkaY3ypP9bvgIjOlhfw= +github.com/drachenfels-de/gocapability v0.0.0-20210413092208-755d79b01352/go.mod h1:BhJFa1j1CrR5IPQo8i5+93q+HAAN2gaJDmNMLL3cPAU= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/opencontainers/runtime-spec v1.0.3-0.20200929063507-e6143ca7d51d h1:pNa8metDkwZjb9g4T8s+krQ+HRgZAkqnXml+wNir/+s= +github.com/opencontainers/runtime-spec v1.0.3-0.20200929063507-e6143ca7d51d/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= -github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= -github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM= -github.com/smartystreets/gunit v1.0.0/go.mod h1:qwPWnhz6pn0NnRBP++URONOVyNkPyr4SauJk4cUOwJs= 
+github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ= +github.com/rs/zerolog v1.20.0 h1:38k9hgtUBdxFwE34yS8rTHmHBa4eN16E4DJlv177LNs= +github.com/rs/zerolog v1.20.0/go.mod h1:IzD0RJ65iWH0w97OQQebJEvTZYvsCUm9WVLWBQrJRjo= +github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= +github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= +github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= -github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0= -github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= -github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= -github.com/tj/go-spin v1.1.0/go.mod h1:Mg1mzmePZm4dva8Qz60H2lHwmJ2loum4VIrLgVnKwh4= -github.com/urfave/cli v1.20.0 h1:fDqGv3UG/4jbVl/QkFwEdddtEDjh/5Ov6X+0B/3bPaw= -github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190621222207-cc06ce4a13d4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3 h1:0GoQqolDA55aaLxZyTzK/Y2ePZzZTUrRacwib7cNsYQ= +github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/urfave/cli/v2 v2.3.0 h1:qph92Y649prgesehzOrQjdWyxFOp/QVM+6imKHad91M= +github.com/urfave/cli/v2 v2.3.0/go.mod h1:LJmUH05zAU44vOAcrfzZQKsZbVcdbOG8rtL3/XcUArI= +golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0 h1:HyfiK1WMnHj5FXFXatD+Qs1A/xC2Run6RzeW1SyHxpc= -golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/sys 
v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210228012217-479acdf4ea46 h1:V066+OYJ66oTjnhm4Yrn7SXIwSCiDQJxpBxmvqb1N1c= +golang.org/x/sys v0.0.0-20210228012217-479acdf4ea46/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190625160430-252024b82959/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190828213141-aed303cbaa74/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= -gopkg.in/lxc/go-lxc.v2 v2.0.0-20190625173123-f4822c6bba64 h1:DU3NyssIxuVNRsE9piaPVHesR/f6KDI+la3s1eQAkTY= -gopkg.in/lxc/go-lxc.v2 v2.0.0-20190625173123-f4822c6bba64/go.mod h1:4K0lbUXeslpmjwJZyW1lI6s5j97mrsj4+kpYwwvuLXo= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/lxc/go-lxc.v2 v2.0.0-20210205143421-c4b883be4881 
h1:YcCjv1g/OoEJ93hK3p+5MhPyuIMD9FwOYF5f4D7rNKk= +gopkg.in/lxc/go-lxc.v2 v2.0.0-20210205143421-c4b883be4881/go.mod h1:4K0lbUXeslpmjwJZyW1lI6s5j97mrsj4+kpYwwvuLXo= +gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/init.go b/init.go new file mode 100644 index 00000000..5f6a6607 --- /dev/null +++ b/init.go @@ -0,0 +1,124 @@ +package lxcri + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +func createFifo(dst string, mode uint32) error { + if err := unix.Mkfifo(dst, mode); err != nil { + return errorf("mkfifo dst:%s failed: %w", dst, err) + } + // lxcri-init must be able to write to the fifo. + // Init process UID/GID can be different from runtime process UID/GID + // liblxc changes the owner of the runtime directory to the effective container UID. + // access to the files is protected by the runtimeDir + // because umask (0022) affects unix.Mkfifo, a separate chmod is required + // FIXME if container UID equals os.GetUID() and spec. + if err := unix.Chmod(dst, mode); err != nil { + return errorf("chmod mkfifo failed: %w", err) + } + return nil +} + +// runAsRuntimeUser returns true if container process is started as runtime user. 
+func runAsRuntimeUser(spec *specs.Spec) bool { + puid := specki.UnmapContainerID(spec.Process.User.UID, spec.Linux.UIDMappings) + return puid == uint32(os.Getuid()) +} + +func configureInit(rt *Runtime, c *Container) error { + initDir := "/.lxcri" + + c.Spec.Mounts = append(c.Spec.Mounts, specs.Mount{ + Source: c.RuntimePath(), + Destination: strings.TrimLeft(initDir, "/"), + Type: "bind", + Options: []string{"bind", "ro", "nodev", "nosuid", "create=dir"}, + }) + + if err := c.SetConfigItem("lxc.init.cwd", initDir); err != nil { + return err + } + + if runAsRuntimeUser(c.Spec) { + if err := createFifo(c.syncFifoPath(), 0600); err != nil { + return fmt.Errorf("failed to create sync fifo: %w", err) + } + } else { + if err := createFifo(c.syncFifoPath(), 0666); err != nil { + return fmt.Errorf("failed to create sync fifo: %w", err) + } + } + + if err := configureInitUser(c); err != nil { + return err + } + + // bind mount lxcri-init into the container + initCmdPath := c.RuntimePath("lxcri-init") + err := touchFile(initCmdPath, 0) + if err != nil { + return fmt.Errorf("failed to create %s: %w", initCmdPath, err) + } + initCmd := filepath.Join(initDir, "lxcri-init") + c.Spec.Mounts = append(c.Spec.Mounts, specs.Mount{ + Source: rt.libexec(ExecInit), + Destination: strings.TrimLeft(initCmd, "/"), + Type: "bind", + Options: []string{"bind", "ro", "nosuid"}, + }) + return c.SetConfigItem("lxc.init.cmd", initCmd) +} + +func touchFile(filePath string, perm os.FileMode) error { + // #nosec + f, err := os.OpenFile(filePath, os.O_CREATE|os.O_RDONLY, perm) + if err == nil { + return f.Close() + } + return err +} + +func configureInitUser(c *Container) error { + // TODO ensure that the user namespace is enabled + // See `man lxc.container.conf` lxc.idmap. 
+ for _, m := range c.Spec.Linux.UIDMappings { + if err := c.SetConfigItem("lxc.idmap", fmt.Sprintf("u %d %d %d", m.ContainerID, m.HostID, m.Size)); err != nil { + return err + } + } + + for _, m := range c.Spec.Linux.GIDMappings { + if err := c.SetConfigItem("lxc.idmap", fmt.Sprintf("g %d %d %d", m.ContainerID, m.HostID, m.Size)); err != nil { + return err + } + } + + if err := c.SetConfigItem("lxc.init.uid", fmt.Sprintf("%d", c.Spec.Process.User.UID)); err != nil { + return err + } + if err := c.SetConfigItem("lxc.init.gid", fmt.Sprintf("%d", c.Spec.Process.User.GID)); err != nil { + return err + } + + if len(c.Spec.Process.User.AdditionalGids) > 0 && c.SupportsConfigItem("lxc.init.groups") { + var b strings.Builder + for i, gid := range c.Spec.Process.User.AdditionalGids { + if i > 0 { + b.WriteByte(',') + } + fmt.Fprintf(&b, "%d", gid) + } + if err := c.SetConfigItem("lxc.init.groups", b.String()); err != nil { + return err + } + } + return nil +} diff --git a/install.sh b/install.sh new file mode 100755 index 00000000..c70bf857 --- /dev/null +++ b/install.sh @@ -0,0 +1,304 @@ +#!/bin/sh -eux +# -e abort if subshell command exits non-zero +# -u treat undefined variables as error +# -x trace shell expansion + +# Package manager dependencies +# NOTE sort lists with: $(echo $PKGS | tr ' ' '\n' | sort | uniq | xargs) + +DISTRIBUTION="$(cat /etc/os-release | grep '^ID=' | cut -d'=' -f2 | tr -d '\n')" +INSTALL_PREFIX=${INSTALL_PREFIX:-/usr/local} +TMPDIR=${TMPDIR:-/tmp/lxcri-build} + +case "$DISTRIBUTION" in +"debian" | "ubuntu") + INSTALL_PKGS=apt_install + CLEAN_PKGS=apt_clean + + export DEBIAN_FRONTEND=noninteractive + + PKGS_BUILD="automake build-essential ca-certificates git libc6-dev libtool make pkg-config wget" + PKGS_BUILD="$PKGS_BUILD libapparmor-dev libbtrfs-dev libc6-dev libcap-dev libdevmapper-dev libglib2.0-dev libseccomp-dev" + + PKGS_RUNTIME="libapparmor1 libbtrfs0 libcap2 libdevmapper1.02.1 libseccomp2" + PKGS="conntrack ebtables ethtool iproute2 
iptables socat" + PKGS="$PKGS ca-certificates libglib2.0-0 systemd tzdata" + PKGS="$PKGS $PKGS_RUNTIME" + ;; +"arch") + INSTALL_PKGS=pacman_install + CLEAN_PKGS=pacman_clean + + BUILD_PKGS="" + BUILD_PKGS="$PKGS_PKGS " + + PKGS_RUNTIME="" + PKGS="" + PKGS="$PKGS " + PKGS="$PKGS $PKGS_RUNTIME" + ;; +"alpine") + INSTALL_PKGS=apk_install + CLEAN_PKGS=apk_clean + + PKGS_BUILD="build-base wget git libtool m4 automake autoconf" + PKGS_BUILD="$PKGS_BUILD btrfs-progs-dev glib-dev libseccomp-dev libcap-dev libapparmor-dev" + + PKGS_RUNTIME="libapparmor btrfs-progs libcap lvm2-dev libseccomp libc6-compat libgcc" + PKGS="conntrack-tools ebtables ethtool iproute2 iptables ip6tables socat" + PKGS="$PKGS ca-certificates glib runit tzdata" + PKGS="$PKGS $PKGS_RUNTIME" + + export MUSL_CC="cc" + ;; +*) + echo "unsupported distribution '$DISTRIBUTION'" + exit 1 + ;; +esac + +mkdir -p $TMPDIR +export PATH=${INSTALL_PREFIX}/go/bin:$PATH + +setup() { + $INSTALL_PKGS $@ + add_golang +} + +clean() { + $CLEAN_PKGS $PKGS_BUILD + remove_golang + rm -rf $TMPDIR +} + +apt_install() { + apt-get update + apt-get install -qq --no-install-recommends --yes $@ +} + +apt_clean() { + apt-get purge -qq --yes $@ + apt-get autoremove -qq --yes + apt-get clean -qq + rm -rf /var/lib/apt/lists/* +} + +pacman_install() { + echo "not implemented" + exit 1 +} + +pacman_clean() { + echo "not implemented" + exit 1 +} + +apk_install() { + echo http://nl.alpinelinux.org/alpine/edge/testing >>/etc/apk/repositories + echo http://nl.alpinelinux.org/alpine/edge/community >>/etc/apk/repositories + apk add --no-cache --update $@ +} + +apk_clean() { + apk del $@ +} + +ldconfig_add() { + if $(which ldconfig 1>/dev/null 2>&1); then + echo $1 >>/etc/ld.so.conf.d/local.conf + ldconfig + fi + # alpine uses musl libc + # /etc/ld-musl-x86_64.path (shared library search path, with components delimited by newlines or colons) + # default "/lib:/usr/local/lib:/usr/lib" + # see musl-libc.org/doc/1.0.0/manual.html +} + +add_golang() 
{ + local src=$GOLANG_SRC + local checksum=$GOLANG_CHECKSUM + local archive="$(basename $src)" + + cd ${INSTALL_PREFIX} + wget --quiet $src + echo "$checksum $archive" | sha256sum -c + tar -xzf $archive + rm ${INSTALL_PREFIX}/$archive +} + +remove_golang() { + rm -rf $(go env GOPATH) + rm -rf $(go env GOCACHE) + rm -rf $(go env GOROOT) +} + +git_clone() { + local tmpdir=$1 + local repo=$2 + local version=$3 + + git clone $repo $tmpdir + cd $tmpdir + git reset --hard $version +} + +add_cni() { + local repo=$CNI_PLUGINS_GIT_REPO + local version=$CNI_PLUGINS_GIT_VERSION + local tmpdir=${TMPDIR}/cni-plugins + + git_clone $tmpdir $repo $version + + ./build_linux.sh + export CNI_PLUGIN_DIR=$INSTALL_PREFIX/cni/bin + mkdir -p $CNI_PLUGIN_DIR + cp bin/* $CNI_PLUGIN_DIR + + cd / + rm -rf $tmpdir +} + +add_conmon() { + local repo=$CONMON_GIT_REPO + local version=$CONMON_GIT_VERSION + local tmpdir=${TMPDIR}/conmon + + git_clone $tmpdir $repo $version + + make clean + make install + + cd / + rm -rf $tmpdir +} + +add_crio() { + local repo=$CRIO_GIT_REPO + local version=$CRIO_GIT_VERSION + local tmpdir=${TMPDIR}/cri-o + + git_clone $tmpdir $repo $version + + make install + + cd / + rm -rf $tmpdir + + # Modify systemd service file to run with full privileges. + # This is required for the runtime to set cgroupv2 device controller eBPF. 
+ sed -i 's/ExecStart=\//ExecStart=+\//' ${INSTALL_PREFIX}/lib/systemd/system/crio.service + + # TODO modify defaults file +} + +add_crictl() { + local checksum=$CRICTL_CHECKSUM + local url=$CRICTL_URL + local archive="$(basename $CRICTL_URL)" + + cd ${TMPDIR} + wget --quiet $url + echo "$checksum $archive" | sha256sum -c + tar -x -z -f $archive -C ${INSTALL_PREFIX}/bin + rm $archive +} + +add_kubernetes() { + local checksum=$K8S_CHECKSUM + local url=$K8S_URL + local archive=$(basename $K8S_URL) + + cd ${TMPDIR} + wget --quiet $url + echo "$checksum $archive" | sha512sum -c + tar -x -z -f $archive -C $INSTALL_PREFIX/bin --strip-components=3 \ + kubernetes/server/bin/kubectl kubernetes/server/bin/kubeadm kubernetes/server/bin/kubelet + rm $archive +} + +LXC_INSTALL_TOOLS=${LXC_INSTALL_TOOLS:-no} +LXC_INSTALL_COMMANDS=${LXC_INSTALL_COMMANDS:-no} +LXC_INSTALL_DOC=${LXC_INSTALL_DOC:-no} +LXC_INSTALL_API_DOCS=${LXC_INSTALL_API_DOCS:-no} + +add_lxc() { + local repo=$LXC_GIT_REPO + local version=$LXC_GIT_VERSION + local tmpdir=${TMPDIR}/lxc + + git_clone $tmpdir $repo $version + + ./autogen.sh + ./configure --enable-bash=no --enable-seccomp=yes \ + --enable-capabilities=yes --enable-apparmor=yes \ + --enable-tools=$LXC_INSTALL_TOOLS --enable-commands=$LXC_INSTALL_COMMANDS \ + --enable-static=no --enable-examples=no \ + --enable-doc=$LXC_INSTALL_DOC --enable-api-docs=$LXC_INSTALL_API_DOCS + make install + git describe --tags >${INSTALL_PREFIX}/lib/liblxc.version.txt + + ldconfig_add ${INSTALL_PREFIX}/lib + cd + rm -rf $tmpdir +} + +add_lxcri() { + local repo=$LXCRI_GIT_REPO + local version=$LXCRI_GIT_VERSION + local tmpdir=${TMPDIR}/lxcri + + git_clone $tmpdir $repo $version + + # lxc installed from source with default installation prefix is prefered + export PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig + make install + + cd + rm -rf $tmpdir +} + +install_all_noclean() { + setup $PKGS_BUILD $PKGS + add_lxc + add_lxcri + add_conmon + add_crio + add_cni + add_crictl + 
add_kubernetes +} + +install_all() { + install_all_noclean + clean +} + +install_runtime_noclean() { + setup $PKGS_BUILD $PKGS_RUNTIME + add_lxc + add_lxcri +} + +install_runtime() { + install_runtime_noclean + clean +} + +update_runtime() { + add_lxc + add_lxcri + clean +} + +update_lxcri() { + setup $PKGS_BUILD $PKGS_RUNTIME + add_lxcri + clean +} + +update_lxcri_dev() { + add_lxcri + clean +} + +$@ diff --git a/lint.yaml b/lint.yaml deleted file mode 100644 index f58ce3b1..00000000 --- a/lint.yaml +++ /dev/null @@ -1,17 +0,0 @@ -issues: - exclude: - - 'Error return value of .((os\.)?std(out|err)\..*|.*Close|.*Flush|os\.Remove(All)?|.*printf?|os\.(Un)?Setenv). is not checked' - - 'error strings should not be capitalized' - - 'error strings should not end with punctuation' - - 'File is not `goimports`-ed' - - 'has \d* occurrences, make it a constant' - - 'line is \d* characters' - - 'is a global variable' - - 'ifElseChain: rewrite if-else to switch statement' - - 'Error return value of `.*` is not checked' - - 'cyclomatic complexity \d* of func' - - 'G107: Potential HTTP request made with variable url' - - 'should have name of the form ErrFoo' - - 'naked return in func' - - 'by other packages, and that stutters; consider calling this' - - 'File is not `gofmt`-ed with `-s`' diff --git a/mount.go b/mount.go new file mode 100644 index 00000000..ef0ef601 --- /dev/null +++ b/mount.go @@ -0,0 +1,203 @@ +package lxcri + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +func removeMountOptions(rt *Runtime, fs string, opts []string, unsupported ...string) []string { + supported := make([]string, 0, len(opts)) + for _, opt := range opts { + addOption := true + for _, u := range unsupported { + if opt == u { + addOption = false + break + } + } + if addOption { + supported = append(supported, opt) + } else { + rt.Log.Info().Str("fs", fs).Str("option", opt).Msg("removed mount option") + } + } + return supported 
+} + +func filterMountOptions(rt *Runtime, fs string, opts []string) []string { + switch fs { + case "sysfs": + return removeMountOptions(rt, fs, opts, "rslave") + case "tmpfs": + // TODO make this configurable per filesystem + return removeMountOptions(rt, fs, opts, "rprivate", "tmpcopyup") + case "cgroup2": + // TODO make this configurable per filesystem + return removeMountOptions(rt, fs, opts, "private", "rslave") + } + return opts +} + +func configureMounts(rt *Runtime, c *Container) error { + // excplicitly disable auto-mounting + if err := c.SetConfigItem("lxc.mount.auto", ""); err != nil { + return err + } + + for i := range c.Spec.Mounts { + ms := c.Spec.Mounts[i] + if ms.Type == "cgroup" { + // TODO check if hieararchy is cgroup v2 only (unified mode) + ms.Type = "cgroup2" + ms.Source = "cgroup2" + // cgroup filesystem is automounted even with lxc.rootfs.managed = 0 + // from 'man lxc.container.conf': + // If cgroup namespaces are enabled, then any cgroup auto-mounting request will be ignored, + // since the container can mount the filesystems itself, and automounting can confuse the container. + } + + // TODO replace with symlink.FollowSymlinkInScope(filepath.Join(rootfs, "/etc/passwd"), rootfs) ? + // "github.com/docker/docker/pkg/symlink" + mountDest, err := resolveMountDestination(c.Spec.Root.Path, ms.Destination) + // Intermediate path resolution failed. This is not an error, since + // the remaining directories / files are automatically created (create=dir|file) + rt.Log.Trace().Err(err).Str("file", ms.Destination).Str("target", mountDest).Msg("resolve mount destination") + + // Check whether the resolved destination of the target link escapes the rootfs. 
+ if !strings.HasPrefix(mountDest, c.Spec.Root.Path) { + // refuses mount destinations that escape from rootfs + return fmt.Errorf("resolved mount target path %s escapes from container root %s", mountDest, c.Spec.Root.Path) + } + + ms.Destination = mountDest + + if err := createMountDestination(c, &ms); err != nil { + return err + } + + ms.Options = filterMountOptions(rt, ms.Type, ms.Options) + + mnt := fmt.Sprintf("%s %s %s %s", ms.Source, ms.Destination, ms.Type, strings.Join(ms.Options, ",")) + + if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { + return err + } + } + return nil +} + +// createMountDestination creates non-existent mount destination paths. +// This is required if rootfs is mounted readonly. +// When the source is a file that should be bind mounted a destination file is created. +// In any other case a target directory is created. +// We add 'create=dir' or 'create=file' to mount options because the mount destination +// may be shadowed by a previous mount. In this case lxc will create the mount destination. +// TODO check whether this is desired behaviour in lxc ? +// Shouldn't the rootfs should be mounted readonly after all mounts destination directories have been created ? 
+// https://github.com/lxc/lxc/issues/1702 +func createMountDestination(c *Container, ms *specs.Mount) error { + info, err := os.Stat(ms.Source) + + // source for bind mount must exist + if err != nil && ms.Type == "bind" { + for _, o := range ms.Options { + if o == "optional" { + return nil + } + } + return errorf("failed to access bind mount source %s: %w", ms.Source, err) + } + + if err != nil || info.IsDir() { + ms.Options = append(ms.Options, "create=dir") + if c.Spec.Root.Readonly { + return os.MkdirAll(ms.Destination, 0755) + } + return nil + } + + ms.Options = append(ms.Options, "create=file") + if c.Spec.Root.Readonly { + if err := os.MkdirAll(filepath.Dir(ms.Destination), 0755); err != nil { + return fmt.Errorf("failed to create mount destination dir: %w", err) + } + f, err := os.OpenFile(ms.Destination, os.O_CREATE, 0755) + if err != nil { + return fmt.Errorf("failed to create file mountpoint: %w", err) + } + return f.Close() + } + return nil +} + +func resolvePathRelative(rootfs string, currentPath string, subPath string) (string, error) { + p := filepath.Join(currentPath, subPath) + + stat, err := os.Lstat(p) + if err != nil { + // target does not exist, resolution ends here + return p, err + } + + if stat.Mode()&os.ModeSymlink == 0 { + return p, nil + } + // resolve symlink + + linkDst, err := os.Readlink(p) + if err != nil { + return p, err + } + + // The destination of an absolute link must be prefixed with the rootfs + if filepath.IsAbs(linkDst) { + if strings.HasPrefix(linkDst, rootfs) { + return p, nil + } + return filepath.Join(rootfs, linkDst), nil + } + + // The link target is relative to currentPath. + return filepath.Clean(filepath.Join(currentPath, linkDst)), nil +} + +// resolveMountDestination resolves mount destination paths for LXC. +// +// Symlinks in mount mount destination paths are not allowed in LXC. 
+// See CVE-2015-1335: Protect container mounts against symlinks +// and https://github.com/lxc/lxc/commit/592fd47a6245508b79fe6ac819fe6d3b2c1289be +// Mount targets that contain symlinks should be resolved relative to the container rootfs. +// e.g k8s service account tokens are mounted to /var/run/secrets/kubernetes.io/serviceaccount +// but /var/run is (mostly) a symlink to /run, so LXC denies to mount the serviceaccount token. +// +// The mount destination must be either relative to the container root or absolute to +// the directory on the host containing the rootfs. +// LXC simply ignores relative mounts paths to an absolute rootfs. +// See man lxc.container.conf #MOUNT POINTS +// +// The mount option `create=dir` should be set when the error os.ErrNotExist is returned. +// The non-existent directories are then automatically created by LXC. + +// source /var/run/containers/storage/overlay-containers/51230afad17aa3b42901f6d9efcba406511821b7e18b2223a6b4c43f9327ce97/userdata/resolv.conf +// destination /etc/resolv.conf +func resolveMountDestination(rootfs string, dst string) (dstPath string, err error) { + // get path entries + entries := strings.Split(strings.TrimPrefix(dst, "/"), "/") + + currentPath := rootfs + // start path resolution at rootfs + for i, entry := range entries { + currentPath, err = resolvePathRelative(rootfs, currentPath, entry) + if err != nil { + // The already resolved path is concatenated with the remaining path, + // if resolution of path fails at some point. 
+ currentPath = filepath.Join(currentPath, filepath.Join(entries[i+1:]...)) + break + } + } + return currentPath, err +} diff --git a/mount_test.go b/mount_test.go new file mode 100644 index 00000000..92a52a49 --- /dev/null +++ b/mount_test.go @@ -0,0 +1,83 @@ +package lxcri + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestResolveMountDestination_absolute(t *testing.T) { + tmpdir, err := os.MkdirTemp("", "golang.test") + require.NoError(t, err) + defer os.RemoveAll(tmpdir) + err = os.MkdirAll(filepath.Join(tmpdir, "folder1"), 0750) + require.NoError(t, err) + err = os.MkdirAll(filepath.Join(tmpdir, "folder2"), 0750) + require.NoError(t, err) + err = os.MkdirAll(filepath.Join(tmpdir, "folder3"), 0750) + require.NoError(t, err) + err = os.Symlink("/folder2", filepath.Join(tmpdir, "folder1", "f2")) + require.NoError(t, err) + err = os.Symlink("/folder3", filepath.Join(tmpdir, "folder2", "f3")) + require.NoError(t, err) + err = os.WriteFile(filepath.Join(tmpdir, "folder3", "test.txt"), []byte("hello"), 0640) + require.NoError(t, err) + + p, err := resolveMountDestination(tmpdir, "/folder1/f2/f3/test.txt") + require.Equal(t, filepath.Join(tmpdir, "/folder3/test.txt"), p) + require.NoError(t, err) + + p, err = resolveMountDestination(tmpdir, "/folder1/f2/xxxxx/fooo") + require.Equal(t, filepath.Join(tmpdir, "/folder2/xxxxx/fooo"), p) + require.Error(t, err, os.ErrExist) + + p, err = resolveMountDestination(tmpdir, "/folder1/f2/f3/hello.txt") + require.Equal(t, filepath.Join(tmpdir, "/folder3/hello.txt"), p) + require.Error(t, err, os.ErrExist) +} + +func TestResolveMountDestination_relative(t *testing.T) { + tmpdir, err := os.MkdirTemp("", "golang.test") + require.NoError(t, err) + defer os.RemoveAll(tmpdir) + err = os.MkdirAll(filepath.Join(tmpdir, "folder1"), 0750) + require.NoError(t, err) + err = os.MkdirAll(filepath.Join(tmpdir, "folder2"), 0750) + require.NoError(t, err) + err = 
os.MkdirAll(filepath.Join(tmpdir, "folder3"), 0750) + require.NoError(t, err) + err = os.Symlink("../folder2", filepath.Join(tmpdir, "folder1", "f2")) + require.NoError(t, err) + err = os.Symlink("../folder3", filepath.Join(tmpdir, "folder2", "f3")) + require.NoError(t, err) + err = os.WriteFile(filepath.Join(tmpdir, "folder3", "test.txt"), []byte("hello"), 0640) + require.NoError(t, err) + + //err = os.Symlink("../../folder2", filepath.Join(tmpdir, "folder1", "f2")) + //require.NoError(t, err) + + p, err := resolveMountDestination(tmpdir, "/folder1/f2/f3/test.txt") + require.Equal(t, filepath.Join(tmpdir, "/folder3/test.txt"), p) + require.NoError(t, err) + + p, err = resolveMountDestination(tmpdir, "/folder1/f2/xxxxx/fooo") + require.Equal(t, filepath.Join(tmpdir, "/folder2/xxxxx/fooo"), p) + require.Error(t, err, os.ErrExist) + + p, err = resolveMountDestination(tmpdir, "/folder1/f2/f3/hello.txt") + require.Equal(t, filepath.Join(tmpdir, "/folder3/hello.txt"), p) + require.Error(t, err, os.ErrExist) +} + +func TestFilterMountOptions(t *testing.T) { + opts := strings.Split("rw,rprivate,noexec,nosuid,nodev,tmpcopyup,create=dir", ",") + rt := Runtime{} + out := filterMountOptions(&rt, "tmpfs", opts) + require.Equal(t, []string{"rw", "noexec", "nosuid", "nodev", "create=dir"}, out) + + out = filterMountOptions(&rt, "nosuchfs", opts) + require.Equal(t, opts, out) +} diff --git a/namespaces.go b/namespaces.go new file mode 100644 index 00000000..e7fc0c02 --- /dev/null +++ b/namespaces.go @@ -0,0 +1,173 @@ +package lxcri + +import ( + "fmt" + "os" + "runtime" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +// namespace is a mapping from the namespace name +// as used in /proc/{pid}/ns and the namespace clone flag, +// as defined in `man 2 clone`. 
+type namespace struct { + Name string + CloneFlag int +} + +var ( + cgroupNamespace = namespace{"cgroup", unix.CLONE_NEWCGROUP} + ipcNamespace = namespace{"ipc", unix.CLONE_NEWIPC} + mountNamespace = namespace{"mnt", unix.CLONE_NEWNS} + networkNamespace = namespace{"net", unix.CLONE_NEWNET} + pidNamespace = namespace{"pid", unix.CLONE_NEWPID} + timeNamespace = namespace{"time", unix.CLONE_NEWTIME} + userNamespace = namespace{"user", unix.CLONE_NEWUSER} + utsNamespace = namespace{"uts", unix.CLONE_NEWUTS} + + namespaceMap = map[specs.LinuxNamespaceType]namespace{ + specs.CgroupNamespace: cgroupNamespace, + specs.IPCNamespace: ipcNamespace, + specs.MountNamespace: mountNamespace, + specs.NetworkNamespace: networkNamespace, + specs.PIDNamespace: pidNamespace, + // specs.timeNamespace: timeNamespace, + specs.UserNamespace: userNamespace, + specs.UTSNamespace: utsNamespace, + } +) + +func cloneFlags(namespaces []specs.LinuxNamespace) (int, error) { + flags := 0 + for _, ns := range namespaces { + n, exist := namespaceMap[ns.Type] + if !exist { + return 0, fmt.Errorf("namespace %s is not supported", ns.Type) + } + flags |= n.CloneFlag + } + return flags, nil +} + +func configureNamespaces(c *Container) error { + seenNamespaceTypes := map[specs.LinuxNamespaceType]bool{} + cloneNamespaces := make([]string, 0, len(c.Spec.Linux.Namespaces)) + + for _, ns := range c.Spec.Linux.Namespaces { + if _, seen := seenNamespaceTypes[ns.Type]; seen { + return fmt.Errorf("duplicate namespace %s", ns.Type) + } + seenNamespaceTypes[ns.Type] = true + + n, supported := namespaceMap[ns.Type] + if !supported { + return fmt.Errorf("unsupported namespace %s", ns.Type) + } + + if ns.Path == "" { + cloneNamespaces = append(cloneNamespaces, n.Name) + continue + } + + configKey := fmt.Sprintf("lxc.namespace.share.%s", n.Name) + if err := c.SetConfigItem(configKey, ns.Path); err != nil { + return err + } + } + + return c.SetConfigItem("lxc.namespace.clone", strings.Join(cloneNamespaces, " ")) +} + 
+func isNamespaceEnabled(spec *specs.Spec, nsType specs.LinuxNamespaceType) bool { + for _, ns := range spec.Linux.Namespaces { + if ns.Type == nsType { + return true + } + } + return false +} + +func getNamespace(spec *specs.Spec, nsType specs.LinuxNamespaceType) *specs.LinuxNamespace { + for _, n := range spec.Linux.Namespaces { + if n.Type == nsType { + return &n + } + } + return nil +} + +// isNamespaceSharedWithHost returns true if the given namespace is nil. +// If the given namespace is not nil then the namespace then true is +// returned if the namespace path refers to the host namespace and +// false otherwise. +// Should be used with isNamespaceSharedWithHost(getNamespace(...)) +func isNamespaceSharedWithRuntime(ns *specs.LinuxNamespace) (bool, error) { + // no namespace with this name defined + if ns == nil { + return true, nil + } + // namespaces without a target path are cloned + if ns.Path == "" { + return false, nil + } + + // from `man namespaces` The /proc/[pid]/ns/ directory [...] + // In Linux 3.7 and earlier, these files were visible as hard links. + // If two processes are in the same namespace, then the device IDs and inode numbers + // of their /proc/[pid]/ns/xxx symbolic links will be the same; + // anapplication can check this using the stat.st_dev and stat.st_ino + // fields returned by stat(2). 
+ // e.g `strace /usr/bin/stat -L /proc/1/ns/pid` + + var stat unix.Stat_t + err := unix.Stat(ns.Path, &stat) + if err != nil { + return false, err + } + + var stat1 unix.Stat_t + err = unix.Stat("/proc/self/ns/pid", &stat1) + if err != nil { + return false, err + } + + sameNS := (stat.Dev == stat1.Dev) && (stat.Ino == stat1.Ino) + return sameNS, nil +} + +// lxc does not set the hostname on shared namespaces +func setHostname(nsPath string, hostname string) error { + // setns only affects the current thread + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + f, err := os.Open(nsPath) + if err != nil { + return fmt.Errorf("failed to open container uts namespace %q: %w", nsPath, err) + } + // #nosec + defer f.Close() + + self, err := os.Open("/proc/self/ns/uts") + if err != nil { + return fmt.Errorf("failed to open uts namespace : %w", err) + } + // #nosec + defer func() { + unix.Setns(int(self.Fd()), unix.CLONE_NEWUTS) + self.Close() + }() + + err = unix.Setns(int(f.Fd()), unix.CLONE_NEWUTS) + if err != nil { + return fmt.Errorf("failed to switch to UTS namespace %s: %w", nsPath, err) + } + err = unix.Sethostname([]byte(hostname)) + if err != nil { + return fmt.Errorf("unix.Sethostname failed: %w", err) + } + return nil +} diff --git a/pkg/internal/lxcri-test/main.go b/pkg/internal/lxcri-test/main.go new file mode 100644 index 00000000..50c11bb2 --- /dev/null +++ b/pkg/internal/lxcri-test/main.go @@ -0,0 +1,59 @@ +package main + +import ( + "fmt" + "io" + "os" + "os/signal" + "strconv" + "syscall" + "time" +) + +var logPrefix string + +func init() { + logPrefix = fmt.Sprintf(">> %s(pid:%d) ", os.Args[0], os.Getpid()) +} + +func logf(format string, args ...interface{}) { + fmt.Printf(logPrefix+format+"\n", args...) +} + +func main() { + sigs := make(chan os.Signal, 1) + + // SIGHUP by default terminates the process, if the process does not catch it. + // `nohup` can be used ignore it. 
+ // see https://en.wikipedia.org/wiki/SIGHUP + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR1) + + go func() { + sig := <-sigs + logf("received signal %q", sig) + }() + + logf("begin") + + sec := 3 + if s, ok := os.LookupEnv("SLEEP"); ok { + n, err := strconv.Atoi(s) + if err != nil { + panic(err) + } + logf("using env SLEEP value %s", s) + sec = n + } + + f, err := os.Open("/proc/self/mounts") + if err != nil { + panic(err) + } + logf("writing /proc/self/mounts") + io.Copy(os.Stdout, f) + + logf("sleeping for %d seconds", sec) + time.Sleep(time.Second * time.Duration(sec)) + + logf("end") +} diff --git a/pkg/log/log.go b/pkg/log/log.go new file mode 100644 index 00000000..18118723 --- /dev/null +++ b/pkg/log/log.go @@ -0,0 +1,72 @@ +// Package log provides logging for lxcri. +package log + +import ( + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/rs/zerolog" +) + +// zerlog log levels are mirrored for convenience. +const ( + TraceLevel = zerolog.TraceLevel + DebugLevel = zerolog.DebugLevel + InfoLevel = zerolog.InfoLevel + WarnLevel = zerolog.WarnLevel + ErrorLevel = zerolog.ErrorLevel + FatalLevel = zerolog.FatalLevel + PanicLevel = zerolog.PanicLevel +) + +func init() { + zerolog.LevelFieldName = "l" + zerolog.MessageFieldName = "m" + + // match liblxc timestamp format + zerolog.TimestampFieldName = "t" + //zerolog.TimeFieldFormat = "20060102150405.000" + zerolog.TimestampFunc = func() time.Time { + return time.Now().UTC() + } + + zerolog.CallerFieldName = "c" + zerolog.CallerMarshalFunc = func(file string, line int) string { + return filepath.Base(file) + ":" + strconv.Itoa(line) + } +} + +// OpenFile opens a new or appends to an existing log file. +// The parent directory is created if it does not exist. 
+func OpenFile(name string, mode os.FileMode) (*os.File, error) { + logDir := filepath.Dir(name) + err := os.MkdirAll(logDir, 0750) + if err != nil { + return nil, fmt.Errorf("failed to create log file directory %s: %w", logDir, err) + } + return os.OpenFile(name, os.O_WRONLY|os.O_APPEND|os.O_CREATE, mode) +} + +// ParseLevel is a wrapper for zerolog.ParseLevel +func ParseLevel(level string) (zerolog.Level, error) { + return zerolog.ParseLevel(strings.ToLower(level)) +} + +// NewLogger creates a new zerlog.Context from the given arguments. +// The returned context is configured to log with timestamp and caller information. +func NewLogger(out io.Writer, level zerolog.Level) zerolog.Context { + // NOTE Unfortunately it's not possible change the possition of the timestamp. + // The ttimestamp is appended to the to the log output because it is dynamically rendered + // see https://github.com/rs/zerolog/issues/109 + return zerolog.New(out).Level(level).With().Timestamp().Caller() +} + +// ConsoleLogger returns a new zerlog.Logger suited for console usage (e.g unit tests) +func ConsoleLogger(color bool, level zerolog.Level) zerolog.Logger { + return zerolog.New(zerolog.ConsoleWriter{Out: os.Stdout, NoColor: !color}).Level(level).With().Timestamp().Caller().Logger() +} diff --git a/pkg/specki/specki.go b/pkg/specki/specki.go new file mode 100644 index 00000000..b9b65589 --- /dev/null +++ b/pkg/specki/specki.go @@ -0,0 +1,366 @@ +// Package specki provides helper functions to process OCI container specs. +// These functions should not contain any code that is `lxcri` specific. +package specki + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// UnmapContainerID returns the (user/group) ID to which the given +// ID is mapped to by the given idmaps. 
+// The returned id will be equal to the given id +// if it is not mapped by the given idmaps. +func UnmapContainerID(id uint32, idmaps []specs.LinuxIDMapping) uint32 { + for _, idmap := range idmaps { + if idmap.Size < 1 { + continue + } + maxID := idmap.ContainerID + idmap.Size - 1 + // check if c.Process.UID is contained in the mapping + if (id >= idmap.ContainerID) && (id <= maxID) { + offset := id - idmap.ContainerID + hostid := idmap.HostID + offset + return hostid + } + } + // uid is not mapped + return id +} + +// RunHooks calls RunHook for each of the given runtime hooks. +// The given runtime state is serialized as JSON and passed to each RunHook call. +func RunHooks(ctx context.Context, state *specs.State, hooks []specs.Hook, continueOnError bool) error { + if len(hooks) == 0 { + return nil + } + stateJSON, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("failed to serialize spec state: %w", err) + } + for i, h := range hooks { + fmt.Printf("running hook[%d] path:%s\n", i, h.Path) + err := RunHook(ctx, stateJSON, h) + if err != nil { + fmt.Printf("hook[%d] failed: %s\n", i, err) + if !continueOnError { + return err + } + } + } + return nil +} + +// RunHook executes the command defined by the given hook. +// The given runtime state is passed over stdin to the executed command. +// The command is executed with the given context ctx, or a sub-context +// of it if Hook.Timeout is not nil. +func RunHook(ctx context.Context, stateJSON []byte, hook specs.Hook) error { + if hook.Timeout != nil { + hookCtx, cancel := context.WithTimeout(ctx, time.Second*time.Duration(*hook.Timeout)) + defer cancel() + ctx = hookCtx + } + cmd := exec.CommandContext(ctx, hook.Path, hook.Args...) 
+ cmd.Env = hook.Env + cmd.Stderr = os.Stderr + cmd.Stdout = os.Stdout + in, err := cmd.StdinPipe() + if err != nil { + return fmt.Errorf("failed to get stdin pipe: %w", err) + } + if err := cmd.Start(); err != nil { + return err + } + if _, err := io.Copy(in, bytes.NewReader(stateJSON)); err != nil { + return err + } + in.Close() + return cmd.Wait() +} + +// DecodeJSONFile reads the next JSON-encoded value from +// the file with the given filename and stores it in the value pointed to by v. +func DecodeJSONFile(filename string, v interface{}) error { + // #nosec + f, err := os.Open(filename) + if err != nil { + return err + } + // #nosec + err = json.NewDecoder(f).Decode(v) + if err != nil { + f.Close() + return fmt.Errorf("failed to decode JSON from %s: %w", filename, err) + } + err = f.Close() + if err != nil { + return fmt.Errorf("failed to close %s: %w", filename, err) + } + return nil +} + +// EncodeJSONFile writes the JSON encoding of v followed by a newline character +// to the file with the given filename. +// The file is opened read-write with the (optional) provided flags. +// The permission bits perm (not affected by umask) are set after the file was closed. +func EncodeJSONFile(filename string, v interface{}, flags int, perm os.FileMode) error { + f, err := os.OpenFile(filename, os.O_RDWR|flags, perm) + if err != nil { + return err + } + enc := json.NewEncoder(f) + err = enc.Encode(v) + if err != nil { + f.Close() + return fmt.Errorf("failed to encode JSON to %s: %w", filename, err) + } + err = f.Close() + if err != nil { + return fmt.Errorf("failed to close %s: %w", filename, err) + } + // Use chmod because initial perm is affected by umask and flags. 
+ err = os.Chmod(filename, perm) + if err != nil { + return fmt.Errorf("failed to 'chmod %o %s': %w", perm, filename, err) + } + return nil +} + +func int64p(v int64) *int64 { + return &v +} + +func modep(m os.FileMode) *os.FileMode { + return &m +} + +// FIXME runtime mandates that /dev/ptmx should be bind mount from host - why ? +// `man 2 mount` | devpts +// ` To use this option effectively, /dev/ptmx must be a symbolic link to pts/ptmx. +// See Documentation/filesystems/devpts.txt in the Linux kernel source tree for details.` +var ( + EssentialDevices = []specs.LinuxDevice{ + specs.LinuxDevice{Type: "c", Major: 1, Minor: 3, FileMode: modep(0666), Path: "/dev/null"}, + specs.LinuxDevice{Type: "c", Major: 1, Minor: 5, FileMode: modep(0666), Path: "/dev/zero"}, + specs.LinuxDevice{Type: "c", Major: 1, Minor: 7, FileMode: modep(0666), Path: "/dev/full"}, + specs.LinuxDevice{Type: "c", Major: 1, Minor: 8, FileMode: modep(0666), Path: "/dev/random"}, + specs.LinuxDevice{Type: "c", Major: 1, Minor: 9, FileMode: modep(0666), Path: "/dev/urandom"}, + specs.LinuxDevice{Type: "c", Major: 5, Minor: 0, FileMode: modep(0666), Path: "/dev/tty"}, + } + + EssentialDevicesAllow = []specs.LinuxDeviceCgroup{ + specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(1), Minor: int64p(3), Access: "rwm"}, // null + specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(1), Minor: int64p(5), Access: "rwm"}, // zero + specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(1), Minor: int64p(7), Access: "rwm"}, // full + specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(1), Minor: int64p(8), Access: "rwm"}, // random + specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(1), Minor: int64p(9), Access: "rwm"}, // urandom + specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(5), Minor: int64p(0), Access: "rwm"}, // tty + specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(5), Minor: int64p(2), Access: "rwm"}, // ptmx + 
specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: int64p(88), Access: "rwm"}, // /dev/pts/{n} + } +) + +// AllowEssentialDevices adds and allows access to EssentialDevices which are required by the +// [runtime spec](https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#default-devices) +func AllowEssentialDevices(spec *specs.Spec) error { + for _, dev := range EssentialDevices { + exist, err := IsDeviceEnabled(spec, dev) + if err != nil { + return err + } + if !exist { + spec.Linux.Devices = append(spec.Linux.Devices, dev) + } + } + + spec.Linux.Resources.Devices = append(spec.Linux.Resources.Devices, EssentialDevicesAllow...) + return nil +} + +// IsDeviceEnabled checks if the LinuxDevice dev is enabled in the Spec spec. +// An error is returned if the device Path matches and Type, Major or Minor don't match. +func IsDeviceEnabled(spec *specs.Spec, dev specs.LinuxDevice) (bool, error) { + for _, d := range spec.Linux.Devices { + if d.Path == dev.Path { + if d.Type != dev.Type { + return false, fmt.Errorf("%s type mismatch (expected %s but was %s)", dev.Path, dev.Type, d.Type) + } + if d.Major != dev.Major { + return false, fmt.Errorf("%s major number mismatch (expected %d but was %d)", dev.Path, dev.Major, d.Major) + } + if d.Minor != dev.Minor { + return false, fmt.Errorf("%s major number mismatch (expected %d but was %d)", dev.Path, dev.Major, d.Major) + } + return true, nil + } + } + return false, nil +} + +// ReadSpecJSON reads the JSON encoded OCI +// spec from the given path. +// This is a convenience function for the cli. +func ReadSpecJSON(p string) (*specs.Spec, error) { + spec := new(specs.Spec) + err := DecodeJSONFile(p, spec) + return spec, err +} + +// ReadSpecProcessJSON reads the JSON encoded OCI +// spec process definition from the given path. +// This is a convenience function for the cli. 
+func ReadSpecProcessJSON(src string) (*specs.Process, error) { + proc := new(specs.Process) + err := DecodeJSONFile(src, proc) + return proc, err +} + +// LoadSpecProcess calls ReadSpecProcessJSON if the given specProcessPath is not empty, +// otherwise it creates a new specs.Process from the given args. +// It's an error if both values are empty. +func LoadSpecProcess(specProcessPath string, args []string) (*specs.Process, error) { + if specProcessPath != "" { + return ReadSpecProcessJSON(specProcessPath) + } + if len(args) == 0 { + return nil, fmt.Errorf("spec process path and args are empty") + } + return &specs.Process{Cwd: "/", Args: args}, nil +} + +// NewSpec returns a minimal spec.Spec instance, which is +// required to run the given process within a container +// using the given rootfs. +// NOTE /proc and /dev folders must be present within the given rootfs. +func NewSpec(rootfs string, cmd string, args ...string) *specs.Spec { + proc := NewSpecProcess(cmd, args...) + + return &specs.Spec{ + Version: specs.Version, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + // isolate all namespaces by default + specs.LinuxNamespace{Type: specs.PIDNamespace}, + specs.LinuxNamespace{Type: specs.MountNamespace}, + specs.LinuxNamespace{Type: specs.IPCNamespace}, + specs.LinuxNamespace{Type: specs.UTSNamespace}, + specs.LinuxNamespace{Type: specs.CgroupNamespace}, + specs.LinuxNamespace{Type: specs.NetworkNamespace}, + }, + Devices: EssentialDevices, + Resources: &specs.LinuxResources{ + Devices: EssentialDevicesAllow, + }, + }, + Mounts: []specs.Mount{ + specs.Mount{Destination: "/proc", Source: "proc", Type: "proc", + Options: []string{"rw", "nosuid", "nodev", "noexec", "relatime"}, + }, + specs.Mount{Destination: "/dev", Source: "tmpfs", Type: "tmpfs", + Options: []string{"rw", "nosuid", "noexec", "relatime", "dev"}, + // devtmpfs (rw,nosuid,relatime,size=6122620k,nr_inodes=1530655,mode=755,inode64) + }, + }, + Process: proc, + Root: &specs.Root{Path: 
rootfs}, + } +} + +// NewSpecProcess creates a specs.Process instance +// from the given command cmd and the command arguments args. +func NewSpecProcess(cmd string, args ...string) *specs.Process { + proc := new(specs.Process) + proc.Args = append(proc.Args, cmd) + proc.Args = append(proc.Args, args...) + proc.Cwd = "/" + return proc +} + +// LoadSpecStateJSON parses specs.State from the JSON encoded file filename. +func LoadSpecStateJSON(filename string) (*specs.State, error) { + state := new(specs.State) + err := DecodeJSONFile(filename, state) + return state, err +} + +// ReadSpecStateJSON parses the JSON encoded specs.State from the given reader. +func ReadSpecStateJSON(r io.Reader) (*specs.State, error) { + state := new(specs.State) + dec := json.NewDecoder(r) + err := dec.Decode(state) + return state, err +} + +// InitHook is a convenience function for OCI hooks. +// It parses specs.State from the given reader and +// loads specs.Spec from the specs.State.Bundle path. +func InitHook(r io.Reader) (rootfs string, state *specs.State, spec *specs.Spec, err error) { + state, err = ReadSpecStateJSON(r) + if err != nil { + return + } + spec, err = ReadSpecJSON(filepath.Join(state.Bundle, "config.json")) + + // quote from https://github.com/opencontainers/runtime-spec/blob/master/config.md#root + // > On POSIX platforms, path is either an absolute path or a relative path to the bundle. + // > For example, with a bundle at /to/bundle and a root filesystem at /to/bundle/rootfs, + // > the path value can be either /to/bundle/rootfs or rootfs. + // > The value SHOULD be the conventional rootfs. + rootfs = spec.Root.Path + if !filepath.IsAbs(rootfs) { + rootfs = filepath.Join(state.Bundle, rootfs) + } + return +} + +// Getenv returns the first matching value from env, +// which has a prefix of key + "=". 
+func Getenv(env []string, key string) (string, bool) { + for _, kv := range env { + if strings.HasPrefix(kv, key+"=") { + val := strings.TrimPrefix(kv, key+"=") + return val, true + } + } + return "", false +} + +// Setenv adds the given variable to the environment env. +// The variable is only added if it is not yet defined +// or if overwrite is set to true. +// Setenv returns the modified environment and +// true the variable is already defined or false otherwise. +func Setenv(env []string, val string, overwrite bool) ([]string, bool) { + a := strings.Split(val, "=") + key := a[0] + for i, kv := range env { + if strings.HasPrefix(kv, key+"=") { + if overwrite { + env[i] = val + } + return env, true + } + } + return append(env, val), false +} + +// BindMount returns a specs.Mount to bind mount src to dest. +// The given mount options opts are merged with the predefined options +// ("bind", "nosuid", "nodev", "relatime") +func BindMount(src string, dest string, opts ...string) specs.Mount { + return specs.Mount{ + Source: src, Destination: dest, Type: "bind", + Options: append([]string{"bind", "nosuid", "nodev", "relatime"}, opts...), + } +} diff --git a/runtime.go b/runtime.go new file mode 100644 index 00000000..8346262f --- /dev/null +++ b/runtime.go @@ -0,0 +1,430 @@ +// Package lxcri provides an OCI specific runtime interface for lxc. +package lxcri + +import ( + "context" + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "time" + + "github.com/creack/pty" + "github.com/drachenfels-de/gocapability/capability" + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/rs/zerolog" + "golang.org/x/sys/unix" + "gopkg.in/lxc/go-lxc.v2" +) + +const ( + // BundleConfigFile is the name of the OCI container bundle config file. + // The content is the JSON encoded specs.Spec. 
+ BundleConfigFile = "config.json" +) + +// Required runtime executables loaded from Runtime.LibexecDir +var ( + // ExecStart starts the liblxc monitor process, similar to lxc-start + ExecStart = "lxcri-start" + // ExecHook is run as liblxc hook and creates additional devices and remounts masked paths. + ExecHook = "lxcri-hook" + ExecHookBuiltin = "lxcri-hook-builtin" + // ExecInit is the container init process that execs the container process. + ExecInit = "lxcri-init" +) + +var ( + // ErrNotExist is returned if the container (runtime dir) does not exist. + ErrNotExist = fmt.Errorf("container does not exist") +) + +// RuntimeFeatures are (security) features supported by the Runtime. +// The supported features are enabled on any Container instance +// created by Runtime.Create. +type RuntimeFeatures struct { + Seccomp bool + Capabilities bool + Apparmor bool + CgroupDevices bool +} + +// Runtime is a factory for creating and managing containers. +// The exported methods of Runtime are required to implement the +// OCI container runtime interface spec (CRI). +// It shares the common settings +type Runtime struct { + // Log is the logger used by the runtime. + Log zerolog.Logger `json:"-"` + // Root is the file path to the runtime directory. + // Directories for containers created by the runtime + // are created within this directory. + Root string + // Use systemd encoded cgroup path (from crio-o/conmon) + // is true if /etc/crio/crio.conf#cgroup_manager = "systemd" + SystemdCgroup bool + // Path for lxc monitor cgroup (lxc specific feature). + // This is the cgroup where the liblxc monitor process (lxcri-start) + // will be placed in. It's similar to /etc/crio/crio.conf#conmon_cgroup + MonitorCgroup string + // LibexecDir is the the directory that contains the runtime executables. + LibexecDir string + // Featuress are runtime (security) features that apply to all containers + // created by the runtime. 
+ Features RuntimeFeatures + + // Environment passed to `lxcri-start` + env []string + + caps capability.Capabilities + + specs.Hooks +} + +func (rt *Runtime) libexec(name string) string { + return filepath.Join(rt.LibexecDir, name) +} + +func (rt *Runtime) hasCapability(s string) bool { + c, exist := capability.Parse(s) + if !exist { + rt.Log.Warn().Msgf("undefined capability %q", s) + return false + } + return rt.caps.Get(capability.EFFECTIVE, c) +} + +// Init initializes the runtime instance. +// It creates required directories and checks the runtimes system configuration. +// Unsupported runtime features are disabled and a warning message is logged. +// Init must be called once for a runtime instance before calling any other method. +func (rt *Runtime) Init() error { + caps, err := capability.NewPid2(0) + if err != nil { + return errorf("failed to create capabilities object: %w", err) + } + if err := caps.Load(); err != nil { + return errorf("failed to load process capabilities: %w", err) + } + rt.caps = caps + + rt.keepEnv("HOME", "XDG_RUNTIME_DIR", "PATH") + + err = canExecute(rt.libexec(ExecStart), rt.libexec(ExecHook), rt.libexec(ExecInit)) + if err != nil { + return errorf("access check failed: %w", err) + } + + if err := isFilesystem("/proc", "proc"); err != nil { + return errorf("procfs not mounted on /proc: %w", err) + } + + cgroupRoot, err = detectCgroupRoot() + if err != nil { + rt.Log.Warn().Msgf("cgroup root detection failed: %s", err) + } + rt.Log.Info().Msgf("using cgroup root %s", cgroupRoot) + + if !lxc.VersionAtLeast(3, 1, 0) { + return errorf("liblxc runtime version is %s, but >= 3.1.0 is required", lxc.Version()) + } + + if !lxc.VersionAtLeast(4, 0, 5) { + rt.Log.Warn().Msgf("liblxc runtime version >= 4.0.5 is recommended (was %s)", lxc.Version()) + } + + rt.Hooks.CreateContainer = []specs.Hook{ + specs.Hook{Path: rt.libexec(ExecHookBuiltin)}, + } + return nil +} + +func (rt *Runtime) checkConfig(cfg *ContainerConfig) error { + if 
len(cfg.ContainerID) == 0 { + return errorf("missing container ID") + } + return rt.checkSpec(cfg.Spec) +} + +func (rt *Runtime) checkSpec(spec *specs.Spec) error { + if spec.Root == nil { + return errorf("spec.Root is nil") + } + if len(spec.Root.Path) == 0 { + return errorf("empty spec.Root.Path") + } + + if spec.Process == nil { + return errorf("spec.Process is nil") + } + + if len(spec.Process.Args) == 0 { + return errorf("specs.Process.Args is empty") + } + + if spec.Process.Cwd == "" { + rt.Log.Info().Msg("specs.Process.Cwd is unset defaulting to '/'") + spec.Process.Cwd = "/" + } + + yes, err := isNamespaceSharedWithRuntime(getNamespace(spec, specs.MountNamespace)) + if err != nil { + return errorf("failed to mount namespace: %s", err) + } + if yes { + return errorf("container wants to share the runtimes mount namespace") + } + + // It should be best practise not to do so, but there are containers that + // want to share the runtimes PID namespaces. e.g sonobuoy/sonobuoy-systemd-logs-daemon-set + yes, err = isNamespaceSharedWithRuntime(getNamespace(spec, specs.PIDNamespace)) + if err != nil { + return errorf("failed to check PID namespace: %s", err) + } + if yes { + rt.Log.Warn().Msg("container shares the PID namespace with the runtime") + } + return nil +} + +func (rt *Runtime) keepEnv(names ...string) { + for _, n := range names { + if val := os.Getenv(n); val != "" { + rt.env = append(rt.env, n+"="+val) + } + } +} + +// Load loads a container from the runtime directory. +// The container must have been created with Runtime.Create. +// The logger Container.Log is set to Runtime.Log by default. +// A loaded Container must be released with Container.Release after use. 
+func (rt *Runtime) Load(containerID string) (*Container, error) { + dir := filepath.Join(rt.Root, containerID) + if _, err := os.Stat(dir); os.IsNotExist(err) { + return nil, ErrNotExist + } + c := &Container{ + ContainerConfig: &ContainerConfig{ + Log: rt.Log, + }, + runtimeDir: dir, + } + if err := c.load(); err != nil { + return nil, err + } + return c, nil +} + +// Start starts the given container. +// Start simply unblocks the container init process `lxcri-init`, +// which then executes the actuall container process. +// The given container must have been created with Runtime.Create. +func (rt *Runtime) Start(ctx context.Context, c *Container) error { + rt.Log.Info().Msg("notify init to start container process") + + state, err := c.State() + if err != nil { + return errorf("failed to get container state: %w", err) + } + if state.SpecState.Status != specs.StateCreated { + return fmt.Errorf("invalid container state. expected %q, but was %q", specs.StateCreated, state.SpecState.Status) + } + + err = c.start(ctx) + if err != nil { + return err + } + + if c.Spec.Hooks != nil { + state, err := c.State() + if err != nil { + return errorf("failed to get container state: %w", err) + } + specki.RunHooks(ctx, &state.SpecState, c.Spec.Hooks.Poststart, true) + } + return nil +} + +func (rt *Runtime) runStartCmd(ctx context.Context, c *Container) (err error) { + // #nosec + cmd := exec.Command(rt.libexec(ExecStart), c.LinuxContainer.Name(), rt.Root, c.ConfigFilePath()) + cmd.Env = rt.env + cmd.Dir = c.RuntimePath() + + if c.ConsoleSocket == "" && !c.Spec.Process.Terminal { + // Inherit stdio from calling process (conmon). 
+ // lxc.console.path must be set to 'none' or stdio of init process is replaced with a PTY by lxc + if err := c.SetConfigItem("lxc.console.path", "none"); err != nil { + return err + } + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + + // NOTE any config change via clxc.SetConfigItem + // must be done before calling SaveConfigFile + err = c.LinuxContainer.SaveConfigFile(c.ConfigFilePath()) + if err != nil { + return errorf("failed to save config file to %q: %w", c.ConfigFilePath(), err) + } + + rt.Log.Debug().Msg("starting lxc monitor process") + if c.ConsoleSocket != "" { + err = runStartCmdConsole(ctx, cmd, c.ConsoleSocket) + } else { + err = cmd.Start() + } + + if err != nil { + return err + } + + c.CreatedAt = time.Now() + c.Pid = cmd.Process.Pid + rt.Log.Info().Int("pid", cmd.Process.Pid).Msg("monitor process started") + + p := c.RuntimePath("lxcri.json") + err = specki.EncodeJSONFile(p, c, os.O_EXCL|os.O_CREATE, 0440) + if err != nil { + return err + } + + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + rt.Log.Debug().Msg("waiting for init") + if err := c.waitCreated(ctx); err != nil { + return err + } + + return nil +} + +func runStartCmdConsole(ctx context.Context, cmd *exec.Cmd, consoleSocket string) error { + dialer := net.Dialer{} + c, err := dialer.DialContext(ctx, "unix", consoleSocket) + if err != nil { + return fmt.Errorf("connecting to console socket failed: %w", err) + } + defer c.Close() + + conn, ok := c.(*net.UnixConn) + if !ok { + return fmt.Errorf("expected a unix connection but was %T", conn) + } + + if deadline, ok := ctx.Deadline(); ok { + err = conn.SetDeadline(deadline) + if err != nil { + return fmt.Errorf("failed to set connection deadline: %w", err) + } + } + + sockFile, err := conn.File() + if err != nil { + return fmt.Errorf("failed to get file from unix connection: %w", err) + } + ptmx, err := pty.Start(cmd) + if err != nil { + return fmt.Errorf("failed to start with pty: %w", err) + } + + 
// Send the pty file descriptor over the console socket (to the 'conmon' process) + // For technical backgrounds see: + // * `man sendmsg 2`, `man unix 3`, `man cmsg 1` + // * https://blog.cloudflare.com/know-your-scm_rights/ + oob := unix.UnixRights(int(ptmx.Fd())) + // Don't know whether 'terminal' is the right data to send, but conmon doesn't care anyway. + err = unix.Sendmsg(int(sockFile.Fd()), []byte("terminal"), oob, nil, 0) + if err != nil { + return fmt.Errorf("failed to send console fd: %w", err) + } + return ptmx.Close() +} + +// Kill sends the signal signum to the container init process. +func (rt *Runtime) Kill(ctx context.Context, c *Container, signum unix.Signal) error { + state, err := c.ContainerState() + if err != nil { + return err + } + if state == specs.StateStopped { + return errorf("container already stopped") + } + return c.kill(ctx, signum) +} + +// Delete removes the container from the runtime directory. +// The container must be stopped or force must be set to true. +// If the container is not stopped but force is set to true, +// the container will be killed with unix.SIGKILL. 
+func (rt *Runtime) Delete(ctx context.Context, containerID string, force bool) error { + rt.Log.Info().Bool("force", force).Msg("delete container") + c, err := rt.Load(containerID) + if err == ErrNotExist { + return err + } + if err != nil { + // NOTE hooks won't run in this case + rt.Log.Warn().Msgf("deleting runtime dir for unloadable container: %s", err) + return os.RemoveAll(filepath.Join(rt.Root, containerID)) + } + + defer c.Release() + + state, err := c.ContainerState() + if err != nil { + return err + } + if state != specs.StateStopped { + c.Log.Debug().Msgf("delete state:%s", state) + if !force { + return errorf("container is not not stopped (current state %s)", state) + } + if err := c.kill(ctx, unix.SIGKILL); err != nil { + return errorf("failed to kill container: %w", err) + } + } + + if err := c.waitMonitorStopped(ctx); err != nil { + c.Log.Error().Msgf("failed to stop monitor process %d", c.Pid) + } + + // From OCI runtime spec + // "Note that resources associated with the container, but not + // created by this container, MUST NOT be deleted." + // The *lxc.Container is created with `rootfs.managed=0`, + // so calling *lxc.Container.Destroy will not delete container resources. 
+ if err := c.LinuxContainer.Destroy(); err != nil { + return fmt.Errorf("failed to destroy container: %w", err) + } + + // the monitor might be part of the cgroup so wait for it to exit + eventsFile := filepath.Join(cgroupRoot, c.CgroupDir, "cgroup.events") + err = pollCgroupEvents(ctx, eventsFile, func(ev cgroupEvents) bool { + return !ev.populated + }) + if err != nil && !os.IsNotExist(err) { + // try to delete the cgroup anyways + c.Log.Warn().Msgf("failed to wait until cgroup.events populated=0: %s", err) + } + + err = deleteCgroup(c.CgroupDir) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to delete cgroup: %s", err) + } + + if c.Spec.Hooks != nil { + state, err := c.State() + if err != nil { + return errorf("failed to get container state: %w", err) + } + specki.RunHooks(ctx, &state.SpecState, c.Spec.Hooks.Poststop, true) + } + + return os.RemoveAll(c.RuntimePath()) +} diff --git a/runtime_test.go b/runtime_test.go new file mode 100644 index 00000000..f67faa27 --- /dev/null +++ b/runtime_test.go @@ -0,0 +1,342 @@ +package lxcri + +import ( + "context" + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "github.com/drachenfels-de/lxcri/pkg/log" + "github.com/drachenfels-de/lxcri/pkg/specki" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" +) + +var logLevel = "debug" +var libexecDir = "/usr/local/libexec/lxcri" +var tmpRoot = "." 
+ +func init() { + // NOTE keep environment variables in sync with `lxcri` cli + if val, ok := os.LookupEnv("LXCRI_LOG_LEVEL"); ok { + logLevel = val + } + if val, ok := os.LookupEnv("LXCRI_LIBEXEC"); ok { + libexecDir = val + } + if val, ok := os.LookupEnv("HOME"); ok { + tmpRoot = val + } +} + +func mkdirTemp() (string, error) { + // /tmp has permissions 1777 + // it should never be used as runtime or rootfs parent + return os.MkdirTemp(tmpRoot, "lxcri-test") +} + +func newRuntime(t *testing.T) *Runtime { + runtimeRoot, err := mkdirTemp() + require.NoError(t, err) + t.Logf("runtime root: %s", runtimeRoot) + + err = unix.Chmod(runtimeRoot, 0755) + require.NoError(t, err) + + level, err := log.ParseLevel(logLevel) + require.NoError(t, err) + + rt := &Runtime{ + Log: log.ConsoleLogger(true, level), + Root: runtimeRoot, + LibexecDir: libexecDir, + //MonitorCgroup: "lxcri-monitor.slice", + } + + require.NoError(t, rt.Init()) + return rt +} + +// NOTE a container that was created successfully must always be +// deleted, otherwise the go test runner will hang because it waits +// for the container process to exit. 
+func newConfig(t *testing.T, cmd string, args ...string) *ContainerConfig { + rootfs, err := mkdirTemp() + require.NoError(t, err) + t.Logf("container rootfs: %s", rootfs) + + // copy test binary to rootfs + //err = exec.Command("cp", cmd, rootfs).Run() + //require.NoError(t, err) + + level, err := log.ParseLevel(logLevel) + require.NoError(t, err) + + cmdAbs, err := filepath.Abs(cmd) + require.NoError(t, err) + cmdDest := "/" + filepath.Base(cmdAbs) + + spec := specki.NewSpec(rootfs, cmdDest) + id := filepath.Base(rootfs) + cfg := ContainerConfig{ + ContainerID: id, Spec: spec, + Log: log.ConsoleLogger(true, level), + } + cfg.Spec.Linux.CgroupsPath = id + ".slice" // use /proc/self/cgroup" + + cfg.Spec.Mounts = append(cfg.Spec.Mounts, + specki.BindMount(cmdAbs, cmdDest), + ) + + // FIXME /dev/stderr has perms 600 + // If container process user is not equal to the + // runtime process user then setting lxc log file will fail + // because of missing permissions. + if runAsRuntimeUser(cfg.Spec) { + cfg.LogFile = "/dev/stderr" + } else { + cfg.LogFile = filepath.Join("/tmp", "log") + } + t.Logf("liblxc log output is written to %s", cfg.LogFile) + cfg.LogLevel = logLevel + + return &cfg +} + +func TestEmptyNamespaces(t *testing.T) { + t.Parallel() + rt := newRuntime(t) + defer os.RemoveAll(rt.Root) + + cfg := newConfig(t, "lxcri-test") + defer os.RemoveAll(cfg.Spec.Root.Path) + + // Clearing all namespaces should not work, + // since the mount namespace must never be shared with the host. 
+ cfg.Spec.Linux.Namespaces = cfg.Spec.Linux.Namespaces[0:0] + + ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) + defer cancel() + + c, err := rt.Create(ctx, cfg) + require.Error(t, err) + t.Logf("create error: %s", err) + require.Nil(t, c) +} + +func TestSharedPIDNamespace(t *testing.T) { + t.Parallel() + if os.Getuid() != 0 { + t.Skipf("PID namespace sharing is only permitted as root.") + } + rt := newRuntime(t) + defer os.RemoveAll(rt.Root) + + cfg := newConfig(t, "lxcri-test") + defer os.RemoveAll(cfg.Spec.Root.Path) + + ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) + defer cancel() + + pidns := specs.LinuxNamespace{ + Type: specs.PIDNamespace, + Path: fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()), + } + + for i, ns := range cfg.Spec.Linux.Namespaces { + if ns.Type == specs.PIDNamespace { + cfg.Spec.Linux.Namespaces[i] = pidns + } + } + + c, err := rt.Create(ctx, cfg) + require.NoError(t, err) + require.NotNil(t, c) + + err = c.Release() + require.NoError(t, err) + + err = rt.Delete(ctx, c.ContainerID, true) + require.NoError(t, err) +} + +// TODO test uts namespace (shared with host) + +// NOTE works only if cgroup root is writable +// sudo chown -R $(whoami):$(whoami) /sys/fs/cgroup/$(cat /proc/self/cgroup | grep '^0:' | cut -d: -f3) +func TestNonEmptyCgroup(t *testing.T) { + t.Parallel() + rt := newRuntime(t) + defer os.RemoveAll(rt.Root) + + cfg := newConfig(t, "lxcri-test") + defer os.RemoveAll(cfg.Spec.Root.Path) + + if os.Getuid() != 0 { + cfg.Spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: uint32(os.Getuid()), Size: 1}, + } + cfg.Spec.Linux.GIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: uint32(os.Getgid()), Size: 1}, + } + } + + ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) + defer cancel() + + c, err := rt.Create(ctx, cfg) + require.NoError(t, err) + require.NotNil(t, c) + + //t.Logf("sleeping 
for a minute") + //time.Sleep(60*time.Second) + + cfg2 := newConfig(t, "lxcri-test") + defer os.RemoveAll(cfg.Spec.Root.Path) + cfg2.Spec.Linux.CgroupsPath = cfg.Spec.Linux.CgroupsPath + + if os.Getuid() != 0 { + cfg2.Spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: uint32(os.Getuid()), Size: 1}, + } + cfg2.Spec.Linux.GIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: uint32(os.Getgid()), Size: 1}, + } + } + + c2, err := rt.Create(ctx, cfg2) + require.Error(t, err) + t.Logf("create error: %s", err) + + err = c.Release() + require.NoError(t, err) + + err = rt.Delete(ctx, c.ContainerID, true) + require.NoError(t, err) + + err = c2.Release() + require.NoError(t, err) + + require.NotNil(t, c2) + err = rt.Delete(ctx, c2.ContainerID, true) + require.NoError(t, err) +} + +func TestRuntimePrivileged(t *testing.T) { + t.Parallel() + if os.Getuid() != 0 { + t.Skipf("This tests only runs as root") + } + + rt := newRuntime(t) + defer os.RemoveAll(rt.Root) + + cfg := newConfig(t, "lxcri-test") + defer os.RemoveAll(cfg.Spec.Root.Path) + + testRuntime(t, rt, cfg) +} + +// The following tests require the following setup: + +// sudo /bin/sh -c "echo '$(whoami):1000:1' >> /etc/subuid" +// sudo /bin/sh -c "echo '$(whoami):20000:65536' >> /etc/subuid" +// sudo /bin/sh -c "echo '$(whoami):1000:1' >> /etc/subgid" +// sudo /bin/sh -c "echo '$(whoami):20000:65536' >> /etc/subgid" +// sudo chown -R $(whoami):$(whoami) /sys/fs/cgroup/unified$(cat /proc/self/cgroup | grep '^0:' | cut -d: -f3) +// sudo chown -R $(whoami):$(whoami) /sys/fs/cgroup$(cat /proc/self/cgroup | grep '^0:' | cut -d: -f3) +// +func TestRuntimeUnprivileged(t *testing.T) { + t.Parallel() + if os.Getuid() == 0 { + t.Skipf("This test only runs as non-root") + } + + rt := newRuntime(t) + defer os.RemoveAll(rt.Root) + + cfg := newConfig(t, "lxcri-test") + defer os.RemoveAll(cfg.Spec.Root.Path) + + // The container UID must have full access to 
the rootfs. + // MkdirTemp sets directory permissions to 0700. + // If we the container UID (0) / or GID are not mapped to the owner (creator) of the rootfs, + // then the rootfs and runtime directory permissions must be expanded. + + err := unix.Chmod(cfg.Spec.Root.Path, 0777) + require.NoError(t, err) + + err = unix.Chmod(rt.Root, 0755) + require.NoError(t, err) + + cfg.Spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: 20000, Size: 65536}, + } + cfg.Spec.Linux.GIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: 20000, Size: 65536}, + } + + testRuntime(t, rt, cfg) +} + +func TestRuntimeUnprivileged2(t *testing.T) { + t.Parallel() + rt := newRuntime(t) + defer os.RemoveAll(rt.Root) + + cfg := newConfig(t, "lxcri-test") + defer os.RemoveAll(cfg.Spec.Root.Path) + + if os.Getuid() != 0 { + cfg.Spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: uint32(os.Getuid()), Size: 1}, + //specs.LinuxIDMapping{ContainerID: 1, HostID: 20000, Size: 65536}, + } + cfg.Spec.Linux.GIDMappings = []specs.LinuxIDMapping{ + specs.LinuxIDMapping{ContainerID: 0, HostID: uint32(os.Getgid()), Size: 1}, + //specs.LinuxIDMapping{ContainerID: 1, HostID: 20000, Size: 65536}, + } + } + + testRuntime(t, rt, cfg) +} + +func testRuntime(t *testing.T, rt *Runtime, cfg *ContainerConfig) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) + defer cancel() + + c, err := rt.Create(ctx, cfg) + require.NoError(t, err) + require.NotNil(t, c) + + state, err := c.State() + require.NoError(t, err) + require.Equal(t, specs.StateCreated, state.SpecState.Status) + + err = rt.Start(ctx, c) + require.NoError(t, err) + + state, err = c.State() + require.NoError(t, err) + require.Equal(t, specs.StateRunning, state.SpecState.Status) + + err = rt.Kill(ctx, c, unix.SIGUSR1) + require.NoError(t, err) + + state, err = c.State() + require.NoError(t, err) + require.Equal(t, 
specs.StateRunning, state.SpecState.Status) + + err = rt.Delete(ctx, c.ContainerID, true) + require.NoError(t, err) + + state, err = c.State() + require.NoError(t, err) + require.Equal(t, specs.StateStopped, state.SpecState.Status) + + err = c.Release() + require.NoError(t, err) +} diff --git a/seccomp.go b/seccomp.go new file mode 100644 index 00000000..a1630b18 --- /dev/null +++ b/seccomp.go @@ -0,0 +1,134 @@ +package lxcri + +import ( + "bufio" + "fmt" + "os" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +var seccompAction = map[specs.LinuxSeccompAction]string{ + specs.ActKill: "kill", + specs.ActTrap: "trap", + specs.ActErrno: "errno", + specs.ActAllow: "allow", + //specs.ActTrace: "trace", + //specs.ActLog: "log", + //specs.ActKillProcess: "kill_process", +} + +// Note seccomp flags (see `man 2 seccomp`) are currently not supported +// https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config-linux.md#seccomp +func writeSeccompProfile(profilePath string, seccomp *specs.LinuxSeccomp) error { + // #nosec + profile, err := os.OpenFile(profilePath, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0440) + if err != nil { + return err + } + // #nosec + defer profile.Close() + + w := bufio.NewWriter(profile) + + // #nosec + w.WriteString("2\n") + + action, err := defaultAction(seccomp) + if err != nil { + return err + } + fmt.Fprintf(w, "allowlist %s\n", action) + + platformArchs, err := seccompArchs(seccomp) + if err != nil { + return fmt.Errorf("failed to detect platform architecture: %w", err) + } + for _, arch := range platformArchs { + fmt.Fprintf(w, "[%s]\n", arch) + for _, sc := range seccomp.Syscalls { + if err := writeSeccompSyscall(w, sc); err != nil { + return err + } + } + } + // ensure profile is written to disk without errors + if err := w.Flush(); err != nil { + return err + } + return profile.Sync() +} + +func defaultAction(seccomp *specs.LinuxSeccomp) (string, error) { + switch seccomp.DefaultAction { + 
case specs.ActKill: + return "kill", nil + case specs.ActTrap: + return "trap", nil + case specs.ActErrno: + return "errno 0", nil + case specs.ActAllow: + return "allow", nil + case specs.ActTrace, specs.ActLog: // Not (yet) supported by lxc + fallthrough + //case specs.ActKillProcess: fallthrough // specs > 1.0.2 + default: + return "kill", fmt.Errorf("unsupported seccomp default action %q", seccomp.DefaultAction) + } +} + +func seccompArchs(seccomp *specs.LinuxSeccomp) ([]string, error) { + var uts unix.Utsname + if err := unix.Uname(&uts); err != nil { + return nil, err + } + nativeArch := nullTerminatedString(uts.Machine[:]) + if len(seccomp.Architectures) == 0 { + return []string{nativeArch}, nil + } + archs := make([]string, len(seccomp.Architectures)) + for _, a := range seccomp.Architectures { + s := strings.ToLower(strings.TrimPrefix(string(a), "SCMP_ARCH_")) + if strings.ToLower(nativeArch) == s { + // lxc seccomp code automatically adds syscalls to compat architectures + return []string{nativeArch}, nil + } + archs = append(archs, s) + } + return archs, nil +} + +func writeSeccompSyscall(w *bufio.Writer, sc specs.LinuxSyscall) error { + for _, name := range sc.Names { + action, ok := seccompAction[sc.Action] + if !ok { + return fmt.Errorf("unsupported seccomp action: %s", sc.Action) + } + + if sc.Action == specs.ActErrno { + var ret uint = 0 + if sc.ErrnoRet != nil { + ret = *sc.ErrnoRet + } + action = fmt.Sprintf("%s %d", action, ret) + } + + if len(sc.Args) == 0 { + fmt.Fprintf(w, "%s %s\n", name, action) + } else { + // Only write a single argument per line - this is required when the same arg.Index is used multiple times. + // from `man 7 seccomp_rule_add_exact_array` + // "When adding syscall argument comparisons to the filter it is important to remember + // that while it is possible to have multiple comparisons in a single rule, + // you can only compare each argument once in a single rule. 
+ // In other words, you can not have multiple comparisons of the 3rd syscall argument in a single rule." + for _, arg := range sc.Args { + fmt.Fprintf(w, "%s %s [%d,%d,%s,%d]\n", name, action, arg.Index, arg.Value, arg.Op, arg.ValueTwo) + } + } + } + return nil +} diff --git a/test/.gitignore b/test/.gitignore deleted file mode 100644 index c272491c..00000000 --- a/test/.gitignore +++ /dev/null @@ -1 +0,0 @@ -oci-cache diff --git a/test/basic-container-config.json b/test/basic-container-config.json deleted file mode 100644 index 5a68f41f..00000000 --- a/test/basic-container-config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "metadata": { - "name": "busybox" - }, - "image":{ - "image": "busybox" - }, - "command": [ - "top" - ], - "log_path":"busybox/0.log", - "linux": { - } -} diff --git a/test/basic-pod-config.json b/test/basic-pod-config.json deleted file mode 100644 index e7ccc473..00000000 --- a/test/basic-pod-config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "metadata": { - "name": "nginx-sandbox", - "namespace": "default", - "attempt": 1, - "uid": "hdishd83djaidwnduwk28bcsb" - }, - "log_directory": "/tmp", - "linux": { - } -} diff --git a/test/basic.bats b/test/basic.bats deleted file mode 100644 index 70e93176..00000000 --- a/test/basic.bats +++ /dev/null @@ -1,20 +0,0 @@ -load helpers - -function setup() { - setup_crio -} - -function teardown() { - cleanup_crio -} - -@test "basic cri-o workings" { - crictl runp test/basic-pod-config.json - crictl pull busybox - crictl images - podid=$(crictl pods | grep nginx-sandbox | awk '{ print $1 }') - crictl create $podid test/basic-container-config.json test/basic-pod-config.json - crictl ps -a | grep busybox - crictl stopp $podid - crictl rmp $podid -} diff --git a/test/crio.conf.in b/test/crio.conf.in deleted file mode 100644 index eb0c7f89..00000000 --- a/test/crio.conf.in +++ /dev/null @@ -1,275 +0,0 @@ -# The CRI-O configuration file specifies all of the available configuration -# options and command-line flags for 
the crio(8) OCI Kubernetes Container Runtime -# daemon, but in a TOML format that can be more easily modified and versioned. -# -# Please refer to crio.conf(5) for details of all configuration options. - -# CRI-O reads its storage defaults from the containers-storage.conf(5) file -# located at /etc/containers/storage.conf. Modify this storage configuration if -# you want to change the system's defaults. If you want to modify storage just -# for CRI-O, you can change the storage configuration options here. -[crio] - -# Path to the "root directory". CRI-O stores all of its data, including -# containers images, in this directory. -root = "CRIOLXC_TEST_DIR/crio-root" - -# Path to the "run directory". CRI-O stores all of its state in this directory. -#runroot = "/tmp/1000" - -# Storage driver used to manage the storage of images and containers. Please -# refer to containers-storage.conf(5) to see all available storage drivers. -#storage_driver = "vfs" - -# List to pass options to the storage driver. Please refer to -# containers-storage.conf(5) to see all available storage options. -#storage_option = [ -#] - -# If set to false, in-memory locking will be used instead of file-based locking. -file_locking = true - -# Path to the lock file. -file_locking_path = "CRIOLXC_TEST_DIR/crio.lock" - - -# The crio.api table contains settings for the kubelet/gRPC interface. -[crio.api] - -# Path to AF_LOCAL socket on which CRI-O will listen. -listen = "CRIOLXC_TEST_DIR/crio.sock" - -# IP address on which the stream server will listen. -stream_address = "127.0.0.1" - -# The port on which the stream server will listen. -stream_port = "0" - -# Enable encrypted TLS transport of the stream server. -stream_enable_tls = false - -# Path to the x509 certificate file used to serve the encrypted stream. This -# file can change, and CRI-O will automatically pick up the changes within 5 -# minutes. -stream_tls_cert = "" - -# Path to the key file used to serve the encrypted stream. 
This file can -# change, and CRI-O will automatically pick up the changes within 5 minutes. -stream_tls_key = "" - -# Path to the x509 CA(s) file used to verify and authenticate client -# communication with the encrypted stream. This file can change, and CRI-O will -# automatically pick up the changes within 5 minutes. -stream_tls_ca = "" - -# Maximum grpc send message size in bytes. If not set or <=0, then CRI-O will default to 16 * 1024 * 1024. -grpc_max_send_msg_size = 16777216 - -# Maximum grpc receive message size. If not set or <= 0, then CRI-O will default to 16 * 1024 * 1024. -grpc_max_recv_msg_size = 16777216 - -# The crio.runtime table contains settings pertaining to the OCI runtime used -# and options for how to set up and manage the OCI runtime. -[crio.runtime] - -# A list of ulimits to be set in containers by default, specified as -# "=:", for example: -# "nofile=1024:2048" -# If nothing is set here, settings will be inherited from the CRI-O daemon -#default_ulimits = [ -#] - -# default_runtime is the _name_ of the OCI runtime to be used as the default. -# The name is matched against the runtimes map below. -default_runtime = "runc" - -# If true, the runtime will not use pivot_root, but instead use MS_MOVE. -no_pivot = false - -# Path to the conmon binary, used for monitoring the OCI runtime. -conmon = "PACKAGES_DIR/conmon/bin/conmon" - -# Path to pinns. -pinns_path = "PACKAGES_DIR/cri-o/bin/pinns" - -# Environment variable list for the conmon process, used for passing necessary -# environment variables to conmon or the runtime. -conmon_env = [ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", -] - -# If true, SELinux will be used for pod separation on the host. -selinux = false - -# Path to the seccomp.json profile which is used as the default seccomp profile -# for the runtime. -seccomp_profile = "CRIOLXC_TEST_DIR/seccomp.json" - -# Used to change the name of the default AppArmor profile of CRI-O. 
The default -# profile name is "crio-default-" followed by the version string of CRI-O. -apparmor_profile = "unconfined" - -# Cgroup management implementation used for the runtime. -cgroup_manager = "cgroupfs" - -# List of default capabilities for containers. If it is empty or commented out, -# only the capabilities defined in the containers json file by the user/kube -# will be added. -default_capabilities = [ - "CHOWN", - "DAC_OVERRIDE", - "FSETID", - "FOWNER", - "NET_RAW", - "SETGID", - "SETUID", - "SETPCAP", - "NET_BIND_SERVICE", - "SYS_CHROOT", - "KILL", -] - -# List of default sysctls. If it is empty or commented out, only the sysctls -# defined in the container json file by the user/kube will be added. -default_sysctls = [ -] - -# List of additional devices. specified as -# "::", for example: "--device=/dev/sdc:/dev/xvdc:rwm". -#If it is empty or commented out, only the devices -# defined in the container json file by the user/kube will be added. -additional_devices = [ -] - -# Path to OCI hooks directories for automatically executed hooks. -hooks_dir = [ -] - -# List of default mounts for each container. **Deprecated:** this option will -# be removed in future versions in favor of default_mounts_file. -default_mounts = [ -] - -# Path to the file specifying the defaults mounts for each container. The -# format of the config is /SRC:/DST, one mount per line. Notice that CRI-O reads -# its default mounts from the following two files: -# -# 1) /etc/containers/mounts.conf (i.e., default_mounts_file): This is the -# override file, where users can either add in their own default mounts, or -# override the default mounts shipped with the package. -# -# 2) /usr/share/containers/mounts.conf: This is the default file read for -# mounts. If you want CRI-O to read from a different, specific mounts file, -# you can change the default_mounts_file. Note, if this is done, CRI-O will -# only add mounts it finds in this file. 
-# -#default_mounts_file = "" - -# Maximum number of processes allowed in a container. -pids_limit = 1024 - -# Maximum sized allowed for the container log file. Negative numbers indicate -# that no size limit is imposed. If it is positive, it must be >= 8192 to -# match/exceed conmon's read buffer. The file is truncated and re-opened so the -# limit is never exceeded. -log_size_max = -1 - -# Whether container output should be logged to journald in addition to the kuberentes log file -log_to_journald = false - -# Path to directory in which container exit files are written to by conmon. -container_exits_dir = "CRIOLXC_TEST_DIR/exits" - -# Path to directory for container attach sockets. -container_attach_socket_dir = "CRIOLXC_TEST_DIR/attach-sockets" - -# If set to true, all containers will run in read-only mode. -read_only = false - -# Changes the verbosity of the logs based on the level it is set to. Options -# are fatal, panic, error, warn, info, and debug. -log_level = "info" - -# The UID mappings for the user namespace of each container. A range is -# specified in the form containerUID:HostUID:Size. Multiple ranges must be -# separated by comma. -uid_mappings = "" - -# The GID mappings for the user namespace of each container. A range is -# specified in the form containerGID:HostGID:Size. Multiple ranges must be -# separated by comma. -gid_mappings = "" - -# The minimal amount of time in seconds to wait before issuing a timeout -# regarding the proper termination of the container. -ctr_stop_timeout = 0 - -# ManageNetworkNSLifecycle determines whether we pin and remove network namespace -# and manage its lifecycle. -manage_network_ns_lifecycle = false - -# The "crio.runtime.runtimes" table defines a list of OCI compatible runtimes. -# The runtime to use is picked based on the runtime_handler provided by the CRI. -# If no runtime_handler is provided, the runtime will be picked based on the level -# of trust of the workload. 
- -[crio.runtime.runtimes.runc] -runtime_path = "CRIOLXC_BINARY" -runtime_type = "oci" -runtime_root = "CRIOLXC_TEST_DIR/runtime-root" - - -# The crio.image table contains settings pertaining to the management of OCI images. -# -# CRI-O reads its configured registries defaults from the system wide -# containers-registries.conf(5) located in /etc/containers/registries.conf. If -# you want to modify just CRI-O, you can change the registries configuration in -# this file. Otherwise, leave insecure_registries and registries commented out to -# use the system's defaults from /etc/containers/registries.conf. -[crio.image] - -# Default transport for pulling images from a remote container storage. -default_transport = "docker://" - -# The image used to instantiate infra containers. -pause_image = "k8s.gcr.io/pause:3.1" - -# If not empty, the path to a docker/config.json-like file containing credentials -# necessary for pulling the image specified by pause_imageĀ above. -pause_image_auth_file = "" - -# The command to run to have a container stay in the paused state. -pause_command = "/pause" - -# Path to the file which decides what sort of policy we use when deciding -# whether or not to trust an image that we've pulled. It is not recommended that -# this option be used, as the default behavior of using the system-wide default -# policy (i.e., /etc/containers/policy.json) is most often preferred. Please -# refer to containers-policy.json(5) for more details. -signature_policy = "CRIOLXC_TEST_DIR/policy.json" - -# Controls how image volumes are handled. The valid values are mkdir, bind and -# ignore; the latter will ignore volumes entirely. -image_volumes = "mkdir" - -# List of registries to be used when pulling an unqualified image (e.g., -# "alpine:latest"). By default, registries is set to "docker.io" for -# compatibility reasons. 
Depending on your workload and usecase you may add more -# registries (e.g., "quay.io", "registry.fedoraproject.org", -# "registry.opensuse.org", etc.). -registries = [ - "docker.io", -] - - -# The crio.network table containers settings pertaining to the management of -# CNI plugins. -[crio.network] - -# Path to the directory where CNI configuration files are located. -network_dir = "CRIOLXC_TEST_DIR/cni/net.d" - -# Paths to directories where CNI plugin binaries are located. -plugin_dirs = [ - "CRIOLXC_TEST_DIR/cni-plugins/bin", -] diff --git a/test/helpers.bash b/test/helpers.bash deleted file mode 100644 index 2bba1b2b..00000000 --- a/test/helpers.bash +++ /dev/null @@ -1,64 +0,0 @@ -ROOT_DIR=$(git rev-parse --show-toplevel) - -function make_tempdir { - declare -g TEMP_DIR=$(realpath $(mktemp -d crio-lxc-test.XXXXXXXX)) - # not strictly necessary, but nice if we end up debugging things by keeping - # the tempdir around - chmod 755 "$TEMP_DIR" -} - -function setup_crio { - make_tempdir - sed \ - -e "s,CRIOLXC_TEST_DIR,$TEMP_DIR,g" \ - -e "s,CRIOLXC_BINARY,$ROOT_DIR/crio-lxc,g" \ - -e "s,PACKAGES_DIR,$PACKAGES_DIR,g" \ - "$ROOT_DIR/test/crio.conf.in" > "$TEMP_DIR/crio.conf" - # it doesn't like seccomp_profile = "", so let's make a bogus one - echo "{}" > "$TEMP_DIR/seccomp.json" - # It doesn't like if these dirs don't exist, so always them - # You can't start a pod without them, so if you're going to test - # basic.bats, then - # cd ~/packages; git clone https://github.com/containernetworking/cni - # git clone https://github.com/containernetworking/plugins cni-plugins - # cd cni-plugins; ./build_linux.sh - mkdir -p "$TEMP_DIR/cni/net.d" - mkdir -p /tmp/busybox # for the logfile as per log_directory in test/basic-pod-config.json - if [ -d ~/packages/cni-plugins ]; then - rsync -a ~/packages/cni-plugins $TEMP_DIR/ - cat > $TEMP_DIR/cni/net.d/10-myptp.conf << EOF 
-{"cniVersion":"0.3.1","name":"myptp","type":"ptp","ipMasq":true,"ipam":{"type":"host-local","subnet":"172.16.29.0/24","routes":[{"dst":"0.0.0.0/0"}]}} -EOF - else - mkdir -p "$TEMP_DIR/cni-plugins" - fi - # set up an insecure policy - echo '{"default": [{"type": "insecureAcceptAnything"}]}' > "$TEMP_DIR/policy.json" - "$PACKAGES_DIR/cri-o/bin/crio" --config "$TEMP_DIR/crio.conf" & - declare -g CRIO_PID=$! -} - -function cleanup_crio { - kill -SIGTERM $CRIO_PID || true - # wait until it dies; it has a bunch of stuff mounted, and we'll get - # various EBUSY races if we don't - wait $CRIO_PID - cleanup_tempdir -} - -function cleanup_tempdir { - [ -f .keeptempdirs ] || rm -rf "$TEMP_DIR" || true -} - -function crictl { - # watch out for: https://github.com/kubernetes-sigs/cri-tools/issues/460 - # If you need more debug output, set CRICTLDEBUG to -D - CRICTLDEBUG="" - $(which crictl) ${CRICTLDEBUG} --runtime-endpoint "$TEMP_DIR/crio.sock" $@ - echo "$output" -} - -function crio-lxc { - $ROOT_DIR/crio-lxc --lxc-path "$TEMP_DIR/lxcpath" $@ - echo "$output" -} diff --git a/test/manual.bats b/test/manual.bats deleted file mode 100644 index c4f9ae0f..00000000 --- a/test/manual.bats +++ /dev/null @@ -1,34 +0,0 @@ -load helpers - -function setup() { - make_tempdir - skopeo --insecure-policy copy docker://alpine:latest oci:$ROOT_DIR/test/oci-cache:alpine - umoci unpack --image "$ROOT_DIR/test/oci-cache:alpine" "$TEMP_DIR/dest" - sed -i -e "s?rootfs?$TEMP_DIR/dest/rootfs?" "$TEMP_DIR/dest/config.json" - sed -i -e "s?\"/bin/sh\"?\"/bin/sleep\",\n\"10\"?" "$TEMP_DIR/dest/config.json" - sed -i -e "s?\"type\": \"ipc\"?\"type\": \"ipc\",\n\"path\": \"/proc/1/ns/ipc\"?" 
"$TEMP_DIR/dest/config.json" - -} - -function teardown() { - cleanup_tempdir -} - -@test "manual invocation" { - crio-lxc --debug --log-level trace --log-file "$TEMP_DIR/log" create --bundle "$TEMP_DIR/dest" --pid-file "$TEMP_DIR/pid" alpine - - status=$(crio-lxc --debug --log-level trace --log-file "$TEMP_DIR/log" state alpine | jq -r .status) - [ $status = "created" ] - - crio-lxc --debug --log-level trace --log-file "$TEMP_DIR/log" start alpine - - status=$(crio-lxc --debug --log-level trace --log-file "$TEMP_DIR/log" state alpine | jq -r .status) - [ $status = "running" ] - - pid1ipcnsinode=$(stat -L -c%i /proc/1/ns/ipc) - mypid=$(<"$TEMP_DIR/pid") - mypidipcnsinode=$(stat -L -c%i "/proc/$mypid/ns/ipc") - [ $pid1ipcnsinode = $mypidipcnsinode ] - crio-lxc --debug --log-level trace --log-file "$TEMP_DIR/log" kill alpine - crio-lxc --debug --log-level trace --log-file "$TEMP_DIR/log" delete alpine -} diff --git a/utils.go b/utils.go new file mode 100644 index 00000000..f5c961d9 --- /dev/null +++ b/utils.go @@ -0,0 +1,61 @@ +package lxcri + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "runtime" + + "golang.org/x/sys/unix" +) + +func canExecute(cmds ...string) error { + for _, c := range cmds { + if err := unix.Access(c, unix.X_OK); err != nil { + return fmt.Errorf("can not execute %q: %w", c, err) + } + } + return nil +} + +func fsMagic(fsName string) int64 { + switch fsName { + case "proc", "procfs": + return unix.PROC_SUPER_MAGIC + case "cgroup2", "cgroup2fs": + return unix.CGROUP2_SUPER_MAGIC + default: + return -1 + } +} + +// TODO check whether dir is the filsystem root (use /proc/mounts) +func isFilesystem(dir string, fsName string) error { + fsType := fsMagic(fsName) + if fsType == -1 { + return fmt.Errorf("undefined filesystem %q", fsName) + } + + var stat unix.Statfs_t + err := unix.Statfs(dir, &stat) + if err != nil { + return fmt.Errorf("fstat failed for %q: %w", dir, err) + } + if stat.Type != fsType { + return fmt.Errorf("%q is not on 
// nullTerminatedString returns the bytes of data in front of the first NUL
// byte as a string. If data contains no NUL byte, all of data is returned.
func nullTerminatedString(data []byte) string {
	// IndexByte returns -1 when there is no NUL byte.
	// Slicing with that index would panic.
	if i := bytes.IndexByte(data, 0); i >= 0 {
		return string(data[:i])
	}
	return string(data)
}

// errorf works like fmt.Errorf but prepends "[binary:file:line] " to the
// message, where binary is the base name of os.Args[0] and file:line is the
// source location of the caller.
//
// NOTE the prefix becomes part of the format string, so a '%' in the binary
// name or the file name would be interpreted as a formatting verb.
func errorf(sfmt string, args ...interface{}) error {
	bin := filepath.Base(os.Args[0])
	_, file, line, _ := runtime.Caller(1)
	prefix := fmt.Sprintf("[%s:%s:%d] ", bin, filepath.Base(file), line)
	return fmt.Errorf(prefix+sfmt, args...)
}