From 2e251180df671149887e2e6506848c4d27d5771e Mon Sep 17 00:00:00 2001 From: Josh Chorlton Date: Wed, 18 Sep 2024 18:13:42 +0000 Subject: [PATCH] Dont ignore failure to create cgroup after timeout Before this commit, creating a cgroup would silently ignore timeouts and carry on. Concretely, this caused cases where a cgroup failed to create, but the caller doesn't realize and ends up looking for files that should exist (e.g. cgroups.controllers), only to find they don't exist. It's very difficult as a caller to deal with this case, where NewSystemd succeeds but the group doesn't exist. The origins of this code seem to trace back to an initial implementation written 5+ years ago: https://github.com/containerd/cgroups/commit/5efa14e3dbc5c34d93de6c706af24fea4a30de46#diff-3331981e4ac06a8d9b06e91842b7f2759c7af3b65287e489a88385948d311ebdR672 runc added roughly the same logic here to deal with the same issue: https://github.com/opencontainers/runc/pull/3782 Now, containerd will also error if a cgroup cannot be created within the timeout window. Signed-off-by: Josh Chorlton --- cgroup2/manager.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cgroup2/manager.go b/cgroup2/manager.go index 0305471..d579cda 100644 --- a/cgroup2/manager.go +++ b/cgroup2/manager.go @@ -952,14 +952,16 @@ func startUnit(conn *systemdDbus.Conn, group string, properties []systemdDbus.Pr } } + systemdStartUnitTimeout := 30 * time.Second select { case s := <-statusChan: if s != "done" { attemptFailedUnitReset(conn, group) return fmt.Errorf("error creating systemd unit `%s`: got `%s`", group, s) } - case <-time.After(30 * time.Second): - log.G(ctx).Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group) + case <-time.After(systemdStartUnitTimeout): + attemptFailedUnitReset(conn, group) + return fmt.Errorf("timed out while waiting for StartTransientUnit(%s) completion signal from dbus after %v", group, systemdStartUnitTimeout) } return nil