diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 0b17168c2bb..920830664a6 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -560,7 +560,33 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { if err != nil { return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err) } - exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) + + // TODO: After https://go-review.googlesource.com/c/go/+/515799 inclued + // in go versions supported by us, we can remove this logic and change + // it back to `exePath = "/proc/self/fd/" + strconv.Itoa(safeExe.Fd())`. + + // Due to a Go stdlib bug, we need to add safeExe to the set of + // ExtraFiles otherwise it is possible for the stdlib to clobber the fd + // during forkAndExecInChild1 and replace it with some other file that + // might be malicious. This is less than ideal (because the descriptor + // will be non-O_CLOEXEC) however we have protections in "runc init" to + // stop us from leaking extra file descriptors. + // + // See . + p.ExtraFiles = append(p.ExtraFiles, safeExe) + + // There is a race situation when we are opening a file, if there is a + // small fd was closed at that time, maybe it will be reused by safeExe. + // Because of Go stdlib fds shuffling bug, if the fd of safeExe is too + // small, go stdlib will dup3 it to another fd, or dup3 a other fd to this + // fd, then it will cause the fd type cmd.Path refers to a random path, + // and it can lead to an error "permission denied" when starting the process. + // Please see #4294. + // So we should not use the original fd of safeExe, but use the fd after + // shuffled by Go stdlib. Because Go stdlib will guarantee this fd refers to + // the correct file. + exePath = "/proc/self/fd/" + strconv.Itoa(stdioFdCount+len(p.ExtraFiles)-1) + p.clonedExes = append(p.clonedExes, safeExe) logrus.Debug("runc-dmz: using /proc/self/exe clone") // used for tests } @@ -618,18 +644,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { ) } - if safeExe != nil { - // Due to a Go stdlib bug, we need to add safeExe to the set of - // ExtraFiles otherwise it is possible for the stdlib to clobber the fd - // during forkAndExecInChild1 and replace it with some other file that - // might be malicious. This is less than ideal (because the descriptor - // will be non-O_CLOEXEC) however we have protections in "runc init" to - // stop us from leaking extra file descriptors. - // - // See . - cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe) - } - // NOTE: when running a container with no PID namespace and the parent // process spawning the container is PID1 the pdeathsig is being // delivered to the container's init process by the kernel for some