Skip to content

Commit

Permalink
On COS, add NVIDIA library directory to LD configuration and update c…
Browse files Browse the repository at this point in the history
…ache.

Unlike Ubuntu VMs where we use Docker's `--gpus` flag, COS VMs do not use
this flag and instead mount the NVIDIA library directories automatically.
However, nothing guarantees that these directories are added to the LD
config. This change fixes that. It takes advantage of the fact that all GPU
tests have the sniffer binary as entrypoint, which slightly overloads the
role of the sniffer within the GPU test infrastructure... but then again
the ioctl sniffer is already deeply intertwined with ld configuration
because it already overrides the `ioctl` libc function, so this doesn't
seem like too big of a stretch.

This change makes the ffmpeg tests succeed with `runc` on COS, but they still
fail with gVisor (with `CUDA_ERROR_OUT_OF_MEMORY` errors). So there must be
some further gVisor-specific error.

Updates #11351
Updates #11321

PiperOrigin-RevId: 715106144
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Jan 14, 2025
1 parent 4ba931d commit 3d6b429
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 3 deletions.
10 changes: 10 additions & 0 deletions pkg/test/dockerutil/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ func GPURunOpts(sniffGPUOpts SniffGPUOpts) (RunOpts, error) {
Type: mount.TypeBind,
ReadOnly: true,
})
break
}
}
for _, nvidiaLib64 := range []string{
Expand All @@ -139,6 +140,8 @@ func GPURunOpts(sniffGPUOpts SniffGPUOpts) (RunOpts, error) {
Type: mount.TypeBind,
ReadOnly: true,
})
sniffGPUOpts.addLDPath = "/usr/local/nvidia/lib64"
break
}
}

Expand Down Expand Up @@ -166,6 +169,10 @@ type SniffGPUOpts struct {
// If unset, defaults to `DefaultGPUCapabilities`.
Capabilities string

// If set, add the given directory to the ld cache.
// Must be a directory visible from within the container.
addLDPath string

// The fields below are set internally.
runSniffer *os.File
}
Expand All @@ -191,6 +198,9 @@ func (sgo *SniffGPUOpts) prepend(argv []string) []string {
if !sgo.AllowIncompatibleIoctl {
snifferArgv = append(snifferArgv, "--enforce_compatibility=INSTANT")
}
if sgo.addLDPath != "" {
snifferArgv = append(snifferArgv, fmt.Sprintf("--add_ld_path=%s", sgo.addLDPath))
}
return append(snifferArgv, argv...)
}

Expand Down
5 changes: 4 additions & 1 deletion tools/ioctl_sniffer/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ cc_binary(

go_binary(
name = "run_sniffer",
srcs = ["run_sniffer.go"],
srcs = [
"ld.go",
"run_sniffer.go",
],
embedsrcs = [
# The 'keep' comment is needed to prevent glaze from removing this
# dependency. This is because the `:ioctl_hook` `cc_binary` rule
Expand Down
38 changes: 38 additions & 0 deletions tools/ioctl_sniffer/ld.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright 2025 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"context"
"fmt"
"os"
"os/exec"
)

// addPathToLd makes the dynamic linker configuration include `path`.
//
// It does so in three steps:
//  1. Writes a dedicated config file under /etc/ld.so.conf.d whose only
//     non-comment line is `path`.
//  2. Deletes /etc/ld.so.cache; a cache file that does not exist is not
//     treated as an error.
//  3. Runs `ldconfig` (bound to `ctx`).
//
// A non-nil error is returned as soon as any step fails. When `ldconfig`
// fails, its combined stdout/stderr is included in the error message.
func addPathToLd(ctx context.Context, path string) error {
	const (
		confFile  = "/etc/ld.so.conf.d/gvisor.conf"
		cacheFile = "/etc/ld.so.cache"
	)

	// Step 1: drop our own config snippet in place.
	contents := []byte("# Generated by gVisor ioctl sniffer\n" + path)
	writeErr := os.WriteFile(confFile, contents, 0644)
	if writeErr != nil {
		return fmt.Errorf("failed to write to ld config file %q: %w", confFile, writeErr)
	}

	// Step 2: get rid of the current cache; it being absent already is fine.
	if rmErr := os.Remove(cacheFile); rmErr != nil {
		if !os.IsNotExist(rmErr) {
			return fmt.Errorf("failed to remove ld cache file: %w", rmErr)
		}
	}

	// Step 3: invoke ldconfig and surface its output on failure.
	ldconfig := exec.CommandContext(ctx, "ldconfig")
	out, runErr := ldconfig.CombinedOutput()
	if runErr != nil {
		return fmt.Errorf("failed to run ldconfig: %w; output: %s", runErr, string(out))
	}
	return nil
}
13 changes: 11 additions & 2 deletions tools/ioctl_sniffer/run_sniffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,11 @@ import (
_ "embed" // Necessary to use go:embed.
)

var enforceCompatibility = flag.String("enforce_compatibility", "", "May be set to 'INSTANT' or 'REPORT'. If set, the sniffer will return a non-zero error code if it detects an unsupported ioctl. 'INSTANT' causes the sniffer to exit immediately when this happens. 'REPORT' causes the sniffer to report all unsupported ioctls at the end of execution.")
var verbose = flag.Bool("verbose", false, "If true, the sniffer will print all Nvidia ioctls it sees.")
// Command-line flags accepted by the sniffer runner.
var (
	// enforceCompatibility selects whether and how unsupported ioctls cause
	// a non-zero exit; empty by default.
	enforceCompatibility = flag.String(
		"enforce_compatibility",
		"",
		"May be set to 'INSTANT' or 'REPORT'. "+
			"If set, the sniffer will return a non-zero error code if it detects an unsupported ioctl. "+
			"'INSTANT' causes the sniffer to exit immediately when this happens. "+
			"'REPORT' causes the sniffer to report all unsupported ioctls at the end of execution.",
	)

	// verbose turns on printing of every Nvidia ioctl seen; off by default.
	verbose = flag.Bool(
		"verbose",
		false,
		"If true, the sniffer will print all Nvidia ioctls it sees.",
	)

	// addLdPath, when non-empty, names a directory to add to the ld cache.
	addLdPath = flag.String(
		"add_ld_path",
		"",
		"If set, reconfigure the ld cache to include the given directory",
	)
)

//go:embed libioctl_hook.so
var ioctlHookSharedObject []byte
Expand Down Expand Up @@ -68,6 +71,12 @@ func Main(ctx context.Context) error {
log.SetLevel(log.Debug)
}

if *addLdPath != "" {
if err := addPathToLd(ctx, *addLdPath); err != nil {
return fmt.Errorf("failed to add path %q to ld: %w", *addLdPath, err)
}
}

// Init our sniffer
if err := sniffer.Init(); err != nil {
return fmt.Errorf("failed to init sniffer: %w", err)
Expand Down

0 comments on commit 3d6b429

Please sign in to comment.