From 9d7a70740586d0ccca9821219de7289b0d3a5c7b Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 27 Nov 2023 15:01:02 +0100 Subject: [PATCH] Create .so symlinks for driver libraries in container This change adds an opt-in feature for creating .so symlinks to all injected driver files in a container. If features.dot-so-symlinks = true is set in the config.toml, the creation of symlinks for driver files is enabled. This can also be triggered on a per-container basis using the envvar NVIDIA_DOT_SO_SYMLINKS=enabled. Signed-off-by: Evan Lezar --- CHANGELOG.md | 1 + .../create-dot-so-symlinks.go | 115 ++++++++++++++++++ cmd/nvidia-ctk/hook/hook.go | 2 + internal/config/features.go | 23 ++-- internal/discover/dot_so_symlinks.go | 27 ++++ internal/discover/graphics.go | 15 +-- internal/lookup/root/options.go | 6 + internal/lookup/root/root.go | 64 ++++++++++ internal/modifier/gated.go | 20 ++- internal/runtime/runtime_factory.go | 2 +- internal/runtime/runtime_factory_test.go | 1 + 11 files changed, 252 insertions(+), 24 deletions(-) create mode 100644 cmd/nvidia-ctk/hook/create-dot-so-symlinks/create-dot-so-symlinks.go create mode 100644 internal/discover/dot_so_symlinks.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b08deca3..97cd9c473 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## v1.15.0 +* Add a hook to create `.so` symlinks for driver libraries in a container. * Remove `nvidia-container-runtime` and `nvidia-docker2` packages. * Use `XDG_DATA_DIRS` environment variable when locating config files such as graphics config files. * Add support for v0.7.0 Container Device Interface (CDI) specification. 
diff --git a/cmd/nvidia-ctk/hook/create-dot-so-symlinks/create-dot-so-symlinks.go b/cmd/nvidia-ctk/hook/create-dot-so-symlinks/create-dot-so-symlinks.go new file mode 100644 index 000000000..bba353199 --- /dev/null +++ b/cmd/nvidia-ctk/hook/create-dot-so-symlinks/create-dot-so-symlinks.go @@ -0,0 +1,115 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dotsosymlinks + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/urfave/cli/v2" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" +) + +type command struct { + logger logger.Interface +} + +type config struct { + containerSpec string + driverVersion string +} + +// NewCommand constructs a hook command with the specified logger +func NewCommand(logger logger.Interface) *cli.Command { + c := command{ + logger: logger, + } + return c.build() +} + +// build +func (m command) build() *cli.Command { + cfg := config{} + + // Create the '' command + c := cli.Command{ + Name: "create-dot-so-symlinks", + Usage: "A hook to create .so symlinks in the container.", + Action: func(c *cli.Context) error { + return m.run(c, &cfg) + }, + } + + c.Flags = []cli.Flag{ + &cli.StringFlag{ + Name: "container-spec", + Usage: "Specify the path to the OCI container spec. 
If empty or '-' the spec will be read from STDIN", + Destination: &cfg.containerSpec, + }, + &cli.StringFlag{ + Name: "driver-version", + Usage: "specify the driver version for which the symlinks are to be created. This assumes driver libraries have the .so.`VERSION` suffix.", + Destination: &cfg.driverVersion, + Required: true, + }, + } + + return &c +} + +func (m command) run(c *cli.Context, cfg *config) error { + s, err := oci.LoadContainerState(cfg.containerSpec) + if err != nil { + return fmt.Errorf("failed to load container state: %v", err) + } + + containerRoot, err := s.GetContainerRoot() + if err != nil { + return fmt.Errorf("failed to determined container root: %v", err) + } + + locator := lookup.NewLibraryLocator( + lookup.WithLogger(m.logger), + lookup.WithRoot(containerRoot), + lookup.WithOptional(true), + ) + libs, err := locator.Locate("*.so." + cfg.driverVersion) + if err != nil { + return fmt.Errorf("failed to locate libraries for driver version %v: %v", cfg.driverVersion, err) + } + + for _, lib := range libs { + if !strings.HasSuffix(lib, ".so."+cfg.driverVersion) { + continue + } + libSoPath := strings.TrimSuffix(lib, "."+cfg.driverVersion) + libSoXPaths, err := filepath.Glob(libSoPath + ".[0-9]") + if len(libSoXPaths) != 1 || err != nil { + continue + } + err = os.Symlink(filepath.Base(libSoXPaths[0]), libSoPath) + if err != nil { + continue + } + } + return nil +} diff --git a/cmd/nvidia-ctk/hook/hook.go b/cmd/nvidia-ctk/hook/hook.go index 54088988a..77ca53ad5 100644 --- a/cmd/nvidia-ctk/hook/hook.go +++ b/cmd/nvidia-ctk/hook/hook.go @@ -22,6 +22,7 @@ import ( "github.com/urfave/cli/v2" + createdotsosymlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/create-dot-so-symlinks" symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/create-symlinks" ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/update-ldcache" ) @@ -50,6 +51,7 @@ func (m hookCommand) build() *cli.Command { 
ldcache.NewCommand(m.logger), symlinks.NewCommand(m.logger), chmod.NewCommand(m.logger), + createdotsosymlinks.NewCommand(m.logger), } return &hook diff --git a/internal/config/features.go b/internal/config/features.go index dfc6b165e..0190cd8df 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -19,10 +19,11 @@ package config type featureName string const ( - FeatureGDS = featureName("gds") - FeatureMOFED = featureName("mofed") - FeatureNVSWITCH = featureName("nvswitch") - FeatureGDRCopy = featureName("gdrcopy") + FeatureGDS = featureName("gds") + FeatureMOFED = featureName("mofed") + FeatureNVSWITCH = featureName("nvswitch") + FeatureGDRCopy = featureName("gdrcopy") + FeatureDotSoSymlinks = featureName("dot-so-symlinks") ) // features specifies a set of named features. @@ -31,6 +32,9 @@ type features struct { MOFED *feature `toml:"mofed,omitempty"` NVSWITCH *feature `toml:"nvswitch,omitempty"` GDRCopy *feature `toml:"gdrcopy,omitempty"` + // DotSoSymlinks allows for the creation of .so symlinks to .so.1 driver + // files to be opted in to. + DotSoSymlinks *feature `toml:"dot-so-symlinks,omitempty"` } type feature bool @@ -40,10 +44,11 @@ type feature bool // variables can also be supplied. func (fs features) IsEnabled(n featureName, in ...getenver) bool { featureEnvvars := map[featureName]string{ - FeatureGDS: "NVIDIA_GDS", - FeatureMOFED: "NVIDIA_MOFED", - FeatureNVSWITCH: "NVIDIA_NVSWITCH", - FeatureGDRCopy: "NVIDIA_GDRCOPY", + FeatureGDS: "NVIDIA_GDS", + FeatureMOFED: "NVIDIA_MOFED", + FeatureNVSWITCH: "NVIDIA_NVSWITCH", + FeatureGDRCopy: "NVIDIA_GDRCOPY", + FeatureDotSoSymlinks: "NVIDIA_DOT_SO_SYMLINKS", } envvar := featureEnvvars[n] @@ -56,6 +61,8 @@ func (fs features) IsEnabled(n featureName, in ...getenver) bool { return fs.NVSWITCH.isEnabled(envvar, in...) case FeatureGDRCopy: return fs.GDRCopy.isEnabled(envvar, in...) + case FeatureDotSoSymlinks: + return fs.DotSoSymlinks.isEnabled(envvar, in...) 
default: return false } diff --git a/internal/discover/dot_so_symlinks.go b/internal/discover/dot_so_symlinks.go new file mode 100644 index 000000000..3b6ab76e0 --- /dev/null +++ b/internal/discover/dot_so_symlinks.go @@ -0,0 +1,27 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package discover + +// NewDotSoSymlinksDiscoverer creates a discoverer that generates a hook to create .so symlinks in +// a container. +func NewDotSoSymlinksDiscoverer(nvidiaCTKPath string, version string) Discover { + return CreateNvidiaCTKHook( + nvidiaCTKPath, + "create-dot-so-symlinks", + "--driver-version", version, + ) +} diff --git a/internal/discover/graphics.go b/internal/discover/graphics.go index 35b51982b..13cf65508 100644 --- a/internal/discover/graphics.go +++ b/internal/discover/graphics.go @@ -27,7 +27,6 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" - "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" ) @@ -252,20 +251,16 @@ func optionalXorgDiscoverer(logger logger.Interface, driver *root.Driver, nvidia } func newXorgDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCTKPath string) (Discover, error) { - libCudaPaths, err := cuda.New( - driver.Libraries(), - ).Locate(".*.*") + 
libRoot, err := driver.LibraryRoot() if err != nil { - return nil, fmt.Errorf("failed to locate libcuda.so: %v", err) + return nil, fmt.Errorf("failed to determine driver library root: %w", err) } - libcudaPath := libCudaPaths[0] - version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.") - if version == "" { - return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath) + version, err := driver.Version() + if err != nil { + return nil, fmt.Errorf("failed to determine driver version: %w", err) } - libRoot := filepath.Dir(libcudaPath) xorgLibs := NewMounts( logger, lookup.NewFileLocator( diff --git a/internal/lookup/root/options.go b/internal/lookup/root/options.go index 6bffe3d8a..f46412b6c 100644 --- a/internal/lookup/root/options.go +++ b/internal/lookup/root/options.go @@ -43,3 +43,9 @@ func WithConfigSearchPaths(paths ...string) Option { d.configSearchPaths = paths } } + +func WithVersion(version string) Option { + return func(d *Driver) { + d.version = version + } +} diff --git a/internal/lookup/root/root.go b/internal/lookup/root/root.go index 4a475ccd4..d8df68eb7 100644 --- a/internal/lookup/root/root.go +++ b/internal/lookup/root/root.go @@ -17,8 +17,11 @@ package root import ( + "fmt" "os" "path/filepath" + "strings" + "sync" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" @@ -26,6 +29,7 @@ import ( // Driver represents a filesystem in which a set of drivers or devices is defined. type Driver struct { + sync.Mutex logger logger.Interface // Root represents the root from the perspective of the driver libraries and binaries. Root string @@ -33,6 +37,10 @@ type Driver struct { librarySearchPaths []string // configSearchPaths specified explicit search paths for discovering driver config files. configSearchPaths []string + // version stores the driver version. This can be specified at construction or cached on subsequent calls. 
+ version string + // libraryRoot stores the absolute path where the driver libraries (libcuda.so.) can be found. + libraryRoot string } // New creates a new Driver root using the specified options. @@ -80,6 +88,62 @@ func (r *Driver) configSearchOptions() []lookup.Option { } } +// Version returns the driver version as a string. +func (r *Driver) Version() (string, error) { + r.Lock() + defer r.Unlock() + if r.version != "" { + return r.version, nil + } + + libcudaPath, err := r.libcudaPath() + if err != nil { + return "", fmt.Errorf("failed to locate libcuda.so: %v", err) + } + + version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.") + if version == "" { + return "", fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath) + } + + r.version = version + return r.version, nil +} + +// LibraryRoot returns the folder in which the driver libraries can be found. +func (r *Driver) LibraryRoot() (string, error) { + r.Lock() + defer r.Unlock() + if r.libraryRoot != "" { + return r.libraryRoot, nil + } + + libcudaPath, err := r.libcudaPath() + if err != nil { + return "", fmt.Errorf("failed to locate libcuda.so: %v", err) + } + + r.libraryRoot = filepath.Dir(libcudaPath) + return r.libraryRoot, nil +} + +// libcudaPath returns the path to libcuda.so.*.* in the driver root. +func (r *Driver) libcudaPath() (string, error) { + pattern := "libcuda.so.*.*" + + locator := r.Libraries() + paths, err := locator.Locate(pattern) + if err != nil { + return "", fmt.Errorf("failed to locate %v: %v", pattern, err) + } + + libcudaPath := paths[0] + if len(paths) > 1 { + r.logger.Warningf("Selecting %v out of multiple libcuda.so paths.", libcudaPath, paths) + } + return libcudaPath, nil +} + // normalizeSearchPaths takes a list of paths and normalized these. // Each of the elements in the list is expanded if it is a path list and the // resultant list is returned. 
diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 5bed3eaf2..13c068d8a 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -23,31 +23,31 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" ) // NewFeatureGatedModifier creates the modifiers for optional features. // These include: // +// NVIDIA_DOT_SO_SYMLINKS=enabled // NVIDIA_GDS=enabled // NVIDIA_MOFED=enabled // NVIDIA_NVSWITCH=enabled // NVIDIA_GDRCOPY=enabled // // If not devices are selected, no changes are made. -func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) { +func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) { if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } - var discoverers []discover.Discover - - driverRoot := cfg.NVIDIAContainerCLIConfig.Root devRoot := cfg.NVIDIAContainerCLIConfig.Root + var discoverers []discover.Discover if cfg.Features.IsEnabled(config.FeatureGDS, image) { - d, err := discover.NewGDSDiscoverer(logger, driverRoot, devRoot) + d, err := discover.NewGDSDiscoverer(logger, driver.Root, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for GDS devices: %w", err) } @@ -78,5 +78,15 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } + if cfg.Features.IsEnabled(config.FeatureDotSoSymlinks, image) { + version, err := driver.Version() + if err != nil { + return nil, fmt.Errorf("failed to get driver 
version required for .so symlinks: %w", err) + } + + d := discover.NewDotSoSymlinksDiscoverer(cfg.NVIDIACTKConfig.Path, version) + discoverers = append(discoverers, d) + } + return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...)) } diff --git a/internal/runtime/runtime_factory.go b/internal/runtime/runtime_factory.go index 5bd7983a0..ccd1aa047 100644 --- a/internal/runtime/runtime_factory.go +++ b/internal/runtime/runtime_factory.go @@ -88,7 +88,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp return nil, err } - featureModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image) + featureModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image, driver) if err != nil { return nil, err } diff --git a/internal/runtime/runtime_factory_test.go b/internal/runtime/runtime_factory_test.go index dd052d8fe..bd34a17c3 100644 --- a/internal/runtime/runtime_factory_test.go +++ b/internal/runtime/runtime_factory_test.go @@ -66,6 +66,7 @@ func TestFactoryMethod(t *testing.T) { logger, _ := testlog.NewNullLogger() driver := root.New( root.WithDriverRoot("/nvidia/driver/root"), + root.WithVersion("999.88.77"), ) testCases := []struct {