diff --git a/k8s/lib.sh b/k8s/lib.sh index 68f9d1364..29aa0d905 100755 --- a/k8s/lib.sh +++ b/k8s/lib.sh @@ -101,8 +101,16 @@ k8s::remove::containerd() { # this is to prevent removing containerd when it is not installed by the snap. # NOTE: do NOT include .containerd-base-dir! By default, it will contain "/". for file in "containerd-socket-path" "containerd-config-dir" "containerd-root-dir" "containerd-cni-bin-dir"; do - if [ -f "$SNAP_COMMON/lock/$file" ]; then - rm -rf $(cat "$SNAP_COMMON/lock/$file") + local lockpath="$SNAP_COMMON/lock/$file" + if [ -f "$lockpath" ]; then + local dirpath=$(cat "$SNAP_COMMON/lock/$file") + + if [ $(readlink -m "$dirpath") = "/" ]; then + echo "WARN: lockfile '$lockpath' points to root ('/'). Skipping cleanup." + continue + fi + + rm -rf "$dirpath" fi done } diff --git a/src/k8s/pkg/k8sd/app/hooks_remove.go b/src/k8s/pkg/k8sd/app/hooks_remove.go index 6b0db3ecd..4e90e6c05 100644 --- a/src/k8s/pkg/k8sd/app/hooks_remove.go +++ b/src/k8s/pkg/k8sd/app/hooks_remove.go @@ -13,6 +13,7 @@ import ( "github.com/canonical/k8s/pkg/k8sd/pki" "github.com/canonical/k8s/pkg/k8sd/setup" "github.com/canonical/k8s/pkg/log" + "github.com/canonical/k8s/pkg/snap" snaputil "github.com/canonical/k8s/pkg/snap/util" "github.com/canonical/k8s/pkg/utils/control" "github.com/canonical/microcluster/v2/cluster" @@ -145,5 +146,63 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr } } + tryCleanupContainerdPaths(log, snap) + return nil } + +// tryCleanupContainerdPaths attempts to clean up all containerd directories which were +// created by the k8s-snap based on the existence of their respective lockfiles +// located in the directory returned by `s.LockFilesDir()`. +func tryCleanupContainerdPaths(log log.Logger, s snap.Snap) { + for lockpath, dirpath := range setup.ContainerdLockPathsForSnap(s) { + // Ensure lockfile exists: + if _, err := os.Stat(lockpath); os.IsNotExist(err) { + log.Info("WARN: failed to find containerd lockfile, no cleanup will be perfomed", "lockfile", lockpath, "directory", dirpath) + continue + } + + // Ensure lockfile's contents is the one we expect: + lockfile_contents := "" + if contents, err := os.ReadFile(lockpath); err != nil { + log.Info("WARN: failed to read contents of lockfile", "lockfile", lockpath, "error", err) + continue + } else { + lockfile_contents = string(contents) + } + + if lockfile_contents != dirpath { + log.Info("WARN: lockfile points to different path than expected", "lockfile", lockpath, "expected", dirpath, "actual", lockfile_contents) + continue + } + + // Check directory exists before attempting to remove: + if _, err := os.Stat(dirpath); os.IsNotExist(err) { + log.Info("Containerd directory doesn't exist; skipping cleanup", "directory", dirpath) + } else { + // NOTE(aznashwan): because of the convoluted interfaces-based way the snap + // composes and creates the original lockfiles (see k8sd/setup/containerd.go) + // this check is meant to defend against accidental code/configuration errors which + // might lead to the root FS being deleted: + realPath, err := os.Readlink(dirpath) + if err != nil { + log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath)) + continue + } + + if realPath == "/" { + log.Error(fmt.Errorf("There is some configuration/logic error in the current versions of the k8s-snap related to lockfile %q (meant to lock %q, which points to %q) which could lead to accidental wiping of the root file system.", lockpath, dirpath, realPath), "Please report this issue upstream immediately.") + continue + } + + if err := os.RemoveAll(dirpath); err != nil { + log.Info("WARN: failed to remove containerd data directory", "directory", dirpath, "error", err, "realPath", realPath) + continue // Avoid removing the lockfile path. + } + } + + if err := os.Remove(lockpath); err != nil { + log.Info("WARN: Failed to remove containerd lockfile", "lockfile", lockpath) + } + } +} diff --git a/src/k8s/pkg/k8sd/setup/containerd.go b/src/k8s/pkg/k8sd/setup/containerd.go index d6c6c8a10..2aba6b70c 100644 --- a/src/k8s/pkg/k8sd/setup/containerd.go +++ b/src/k8s/pkg/k8sd/setup/containerd.go @@ -166,21 +166,46 @@ func Containerd(snap snap.Snap, extraContainerdConfig map[string]any, extraArgs return nil } -func saveSnapContainerdPaths(s snap.Snap) error { - // Write the containerd-related paths to files to properly clean-up on removal. +// ContainerdLockPathsForSnap returns a mapping between the absolute paths of +// the lockfiles within the k8s snap and the absolute paths of the containerd +// directory they lock. +// +// WARN: these lockfiles are meant to be used in later cleanup stages. +// DO NOT include any system paths which are not managed by the k8s-snap! +// +// It intentionally does NOT include the containerd base dir lockfile +// (which most of the rest of the paths are based on), as it is meant +// to indicate the root of the containerd install ('/' or '/var/snap/k8s/*'). +func ContainerdLockPathsForSnap(s snap.Snap) map[string]string { m := map[string]string{ "containerd-socket-path": s.ContainerdSocketDir(), "containerd-config-dir": s.ContainerdConfigDir(), "containerd-root-dir": s.ContainerdRootDir(), "containerd-cni-bin-dir": s.CNIBinDir(), - snap.ContainerdBaseDir: s.GetContainerdBaseDir(), } - for filename, content := range m { - if err := utils.WriteFile(filepath.Join(s.LockFilesDir(), filename), []byte(content), 0o600); err != nil { - return fmt.Errorf("failed to write %s: %w", filename, err) + prefixed := map[string]string{} + for k, v := range m { + prefixed[filepath.Join(s.LockFilesDir(), k)] = v + } + + return prefixed +} + +// saveSnapContainerdPaths creates the lock files for the containerd directory paths to be used for later cleanup. +func saveSnapContainerdPaths(s snap.Snap) error { + for lockpath, dirpath := range ContainerdLockPathsForSnap(s) { + if err := utils.WriteFile(lockpath, []byte(dirpath), 0o600); err != nil { + return fmt.Errorf("failed to write %s: %w", lockpath, err) } } + + // Save the Containerd Base Dir separately: + baseDirPath := filepath.Join(s.LockFilesDir(), snap.ContainerdBaseDir) + if err := utils.WriteFile(baseDirPath, []byte(s.GetContainerdBaseDir()), 0o600); err != nil { + return fmt.Errorf("failed to write %s: %w", baseDirPath, err) + } + return nil } diff --git a/src/k8s/pkg/k8sd/setup/containerd_test.go b/src/k8s/pkg/k8sd/setup/containerd_test.go index 8bd56399b..538202dfd 100644 --- a/src/k8s/pkg/k8sd/setup/containerd_test.go +++ b/src/k8s/pkg/k8sd/setup/containerd_test.go @@ -31,6 +31,7 @@ func TestContainerd(t *testing.T) { ContainerdRegistryConfigDir: filepath.Join(dir, "containerd-hosts"), ContainerdStateDir: filepath.Join(dir, "containerd-state"), ContainerdExtraConfigDir: filepath.Join(dir, "containerd-confd"), + LockFilesDir: filepath.Join(dir, "lockfiles"), ServiceArgumentsDir: filepath.Join(dir, "args"), CNIBinDir: filepath.Join(dir, "opt-cni-bin"), CNIConfDir: filepath.Join(dir, "cni-netd"), @@ -129,9 +130,16 @@ func TestContainerd(t *testing.T) { "containerd-root-dir": s.ContainerdRootDir(), "containerd-cni-bin-dir": s.CNIBinDir(), } - for filename, content := range m { - b, err := os.ReadFile(filepath.Join(s.LockFilesDir(), filename)) + // Ensure locks directory exists: + files, err := os.ReadDir(s.LockFilesDir()) + g.Expect(err).To(Not(HaveOccurred())) + t.Logf("Lockfiles currently in %q: %v", s.LockFilesDir(), files) + + for filename, content := range m { + path := filepath.Join(s.LockFilesDir(), filename) + t.Logf("Checking path: %s", path) + b, err := os.ReadFile(path) g.Expect(err).To(Not(HaveOccurred())) g.Expect(string(b)).To(Equal(content)) } diff --git a/tests/integration/tests/test_cleanup.py b/tests/integration/tests/test_cleanup.py index ec54ebd49..9633fb518 100644 --- a/tests/integration/tests/test_cleanup.py +++ b/tests/integration/tests/test_cleanup.py @@ -11,6 +11,8 @@ LOG = logging.getLogger(__name__) +KUBE_CONTROLLER_MANAGER_SNAP_PORT = 10257 + CONTAINERD_PATHS = [ "/etc/containerd", "/run/containerd", @@ -19,9 +21,22 @@ CNI_PATH = "/opt/cni/bin" +def _assert_paths_not_exist(instance: harness.Instance, paths: List[str]): + paths_which_exist = [ + p + for p, exists in util.check_file_paths_exist(instance, paths).items() + if exists + ] + if paths_which_exist: + raise AssertionError( + f"Expected the following path(s) to not exist: {paths_which_exist}" + ) + + @pytest.mark.node_count(1) @pytest.mark.tags(tags.NIGHTLY) def test_node_cleanup(instances: List[harness.Instance], tmp_path): + """Verifies that a `snap remove k8s` will perform proper cleanup.""" instance = instances[0] util.wait_for_dns(instance) util.wait_for_network(instance) @@ -30,11 +45,7 @@ def test_node_cleanup(instances: List[harness.Instance], tmp_path): # Check that the containerd-related folders are removed on snap removal. all_paths = CONTAINERD_PATHS + [CNI_PATH] - process = instance.exec( - ["ls", *all_paths], capture_output=True, text=True, check=False - ) - for path in all_paths: - assert f"cannot access '{path}': No such file or directory" in process.stderr + _assert_paths_not_exist(instance, all_paths) util.setup_k8s_snap(instance, tmp_path) instance.exec(["k8s", "bootstrap"]) @@ -88,18 +99,60 @@ def test_node_cleanup_new_containerd_path(instances: List[harness.Instance]): assert ( f"cannot access '{path}': No such file or directory" in process.stderr ) + _assert_paths_not_exist(instance, exp_missing_paths) # Check that the containerd-related folders are in the new locations. # If one of them is missing, this should have a non-zero exit code. instance.exec(["ls", *new_containerd_paths], check=True) for instance in instances: - # Check that the containerd-related folders are not in the new locations after snap removal. util.remove_k8s_snap(instance) - process = instance.exec( - ["ls", *new_containerd_paths], capture_output=True, text=True, check=False + # Check that the containerd-related folders are not in the new locations after snap removal. + _assert_paths_not_exist(instance, new_containerd_paths) + + +@pytest.mark.node_count(1) +@pytest.mark.no_setup() +@pytest.mark.tags(tags.NIGHTLY) +def test_containerd_path_cleanup_on_failed_init( + instances: List[harness.Instance], tmp_path +): + """Tests that a failed `bootstrap` properly cleans up any + containerd-related paths it may have created as part of the + failed `bootstrap`. + + It induces a bootstrap failure by pre-binding a required k8s service + port (10257 for the kube-controller-manager) before running `k8s bootstrap`. + + NOTE: a failed `join-cluster` will trigger the exact same cleanup + hook, so the test implicitly applies to it as well. + """ + instance = instances[0] + expected_code = 1 + expected_message = ( + "Encountered error(s) while verifying port availability for Kubernetes " + "services: Port 10257 (needed by: kube-controller-manager) is already in use." + ) + + with util.open_port(KUBE_CONTROLLER_MANAGER_SNAP_PORT) as _: + util.setup_k8s_snap(instance, tmp_path, config.SNAP, connect_interfaces=False) + + proc = instance.exec( + ["k8s", "bootstrap"], capture_output=True, text=True, check=False ) - for path in new_containerd_paths: - assert ( - f"cannot access '{path}': No such file or directory" in process.stderr + + if proc.returncode != expected_code: + raise AssertionError( + f"Expected `k8s bootstrap` to exit with code {expected_code}, " + f"but it exited with {proc.returncode}.\n" + f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}" ) + + if expected_message not in proc.stderr: + raise AssertionError( + f"Expected to find port-related warning '{expected_message}' in " + "stderr of the `k8s bootstrap` command.\n" + f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}" + ) + + _assert_paths_not_exist(instance, CONTAINERD_PATHS) diff --git a/tests/integration/tests/test_util/util.py b/tests/integration/tests/test_util/util.py index 117c9ded0..f1d0d31b2 100644 --- a/tests/integration/tests/test_util/util.py +++ b/tests/integration/tests/test_util/util.py @@ -1,11 +1,13 @@ # # Copyright 2024 Canonical, Ltd. # +import contextlib import ipaddress import json import logging import re import shlex +import socket import subprocess import urllib.request from datetime import datetime @@ -514,3 +516,51 @@ def find_suitable_cidr(parent_cidr: str, excluded_ips: List[str]): return str(lb_net) raise RuntimeError("Could not find a suitable CIDR for LoadBalancer services") + + +@contextlib.contextmanager +def open_port( + port: int, + host: str = "", + address_family: socket.AddressFamily = socket.AF_INET, + socket_kind: socket.SocketKind = socket.SOCK_STREAM, + max_backlogged_connections: int = 0, +): + """Context manager which opens a socket with the given properties + and binds it to the given port. + + Yields the already setup and listening socket object for use. + + By default, it will only allow one single active connection + and instantly refuse any new ones. Use the `max_backlogged_connections` + argument if you'd like it to accept more connections as `pending`. + """ + sock = socket.socket(family=address_family, type=socket_kind) + if not host: + host = socket.gethostname() + sock.bind((host, port)) + LOG.info(f"Successfully bound new socket on '{host}:{port}'") + + try: + sock.listen(max_backlogged_connections) + LOG.info(f"Successfully started listening on '{host}:{port}'") + yield sock + finally: + sock.close() + LOG.info(f"Closed socket on '{host}:{port}'") + + +def check_file_paths_exist( + instance: harness.Instance, paths: List[str] +) -> Mapping[str, bool]: + """Returns whether the given path(s) exist within the given harness instance + by checking the output of a single `ls` command containing all of them. + + It is recommended to always use absolute paths, as the cwd relative to + which the `ls` will get executed depends on the harness instance. + """ + process = instance.exec(["ls", *paths], capture_output=True, text=True, check=False) + return { + p: not (f"cannot access '{p}': No such file or directory" in process.stderr) + for p in paths + }