Pass the NVIDIA_DRIVER_CAPABILITIES env var to nvidia-container-cli.
runsc attempts to emulate nvidia-container-runtime-hook, but it always
passed `--compute --utility` as the driver capability flags to the
`nvidia-container-cli configure` command.

Fix runsc to emulate nvidia-container-runtime-hook correctly by parsing
NVIDIA_DRIVER_CAPABILITIES and converting that comma-separated list to flags.
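
For example (values illustrative, not from this change): NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics now becomes `--compute --utility --graphics`, rather than the hardcoded `--compute --utility`.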

This is in preparation for adding support for non-compute GPU workloads in
nvproxy :)

Updates #9452
Updates #10856

PiperOrigin-RevId: 671644915
ayushr2 authored and gvisor-bot committed Sep 6, 2024
1 parent 3c4b246 commit d8574d4
Showing 4 changed files with 114 additions and 8 deletions.
4 changes: 4 additions & 0 deletions runsc/config/config.go
@@ -325,6 +325,10 @@ type Config struct {
	// the latest supported NVIDIA driver ABI.
	NVProxyDriverVersion string `flag:"nvproxy-driver-version"`

+	// NVProxyAllowedDriverCapabilities is a comma-separated list of driver
+	// capabilities that are allowed to be requested by the container.
+	NVProxyAllowedDriverCapabilities string `flag:"nvproxy-allowed-driver-capabilities"`
+
	// TPUProxy enables support for TPUs.
	TPUProxy bool `flag:"tpuproxy"`

1 change: 1 addition & 0 deletions runsc/config/flags.go
@@ -134,6 +134,7 @@ func RegisterFlags(flagSet *flag.FlagSet) {
	flagSet.Bool("nvproxy", false, "EXPERIMENTAL: enable support for Nvidia GPUs")
	flagSet.Bool("nvproxy-docker", false, "DEPRECATED: use nvidia-container-runtime or `docker run --gpus` directly. Or manually add nvidia-container-runtime-hook as a prestart hook and set up NVIDIA_VISIBLE_DEVICES container environment variable.")
	flagSet.String("nvproxy-driver-version", "", "NVIDIA driver ABI version to use. If empty, autodetect installed driver version. The special value 'latest' may also be used to use the latest ABI.")
+	flagSet.String("nvproxy-allowed-driver-capabilities", "utility,compute", "Comma-separated list of NVIDIA driver capabilities that are allowed to be requested by the container.")
	flagSet.Bool("tpuproxy", false, "EXPERIMENTAL: enable support for TPU device passthrough.")

	// Test flags, not to be used outside tests, ever.
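Usage sketch (hypothetical invocation, not part of this change): an operator who wants to permit graphics workloads can widen the allowed set when invoking the runtime:

runsc --nvproxy --nvproxy-allowed-driver-capabilities=utility,compute,graphics run <container-id>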
19 changes: 16 additions & 3 deletions runsc/container/container.go
@@ -1936,6 +1936,11 @@ func nvproxyLoadKernelModules() {
	}
}

+// See nvidia-container-toolkit/cmd/nvidia-container-runtime-hook/capabilities.go:capabilityToCLI().
+func nvproxyDriverCapToFlag(cap specutils.NvidiaDriverCap) string {
+	return "--" + string(cap)
+}
+
// nvproxySetupAfterGoferUserns runs `nvidia-container-cli configure`.
// This sets up the container filesystem with bind mounts that allow it to
// use NVIDIA devices.
@@ -2011,12 +2016,20 @@ func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCm
		"configure",
		fmt.Sprintf("--ldconfig=@%s", ldconfigPath),
		"--no-cgroups", // runsc doesn't configure device cgroups yet
-		"--utility",
-		"--compute",
		fmt.Sprintf("--pid=%d", goferCmd.Process.Pid),
		fmt.Sprintf("--device=%s", devices),
-		spec.Root.Path,
	}
+	// Pass driver capabilities specified via NVIDIA_DRIVER_CAPABILITIES as flags. See
+	// nvidia-container-toolkit/cmd/nvidia-container-runtime-hook/main.go:doPrestart().
+	driverCaps, err := specutils.NvproxyDriverCapsFromEnv(spec, conf)
+	if err != nil {
+		return fmt.Errorf("failed to get driver capabilities: %w", err)
+	}
+	for cap := range driverCaps {
+		argv = append(argv, nvproxyDriverCapToFlag(cap))
+	}
+	// Add rootfs path as the final argument.
+	argv = append(argv, spec.Root.Path)
	log.Debugf("Executing %q", argv)
	var stdout, stderr strings.Builder
	cmd := exec.Cmd{
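For illustration, with NVIDIA_DRIVER_CAPABILITIES=compute,utility the hook emulation now assembles a command along these lines (binary path, ldconfig path, PID, device list, and rootfs are hypothetical placeholders):

nvidia-container-cli configure --ldconfig=@/sbin/ldconfig --no-cgroups --pid=12345 --device=all --compute --utility /run/containers/rootfs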
98 changes: 93 additions & 5 deletions runsc/specutils/nvidia.go
@@ -23,10 +23,67 @@ import (
	"gvisor.dev/gvisor/runsc/config"
)

-const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES"
+const (
+	nvidiaVisibleDevsEnv = "NVIDIA_VISIBLE_DEVICES"
+	nvidiaDriverCapsEnv  = "NVIDIA_DRIVER_CAPABILITIES"
+	cudaVersionEnv       = "CUDA_VERSION"
+	requireCudaEnv       = "NVIDIA_REQUIRE_CUDA"
+	// AnnotationNVProxy enables nvproxy.
+	AnnotationNVProxy = "dev.gvisor.internal.nvproxy"
+)
+
+// NvidiaDriverCap is a GPU driver capability (like compute, graphics, etc.).
+type NvidiaDriverCap string
+
+const (
+	computeCap NvidiaDriverCap = "compute"
+	utilityCap NvidiaDriverCap = "utility"
+	// allCap is a special value that means all supported driver capabilities.
+	allCap NvidiaDriverCap = "all"
+)
+
+// NvidiaDriverCaps is a set of GPU driver capabilities.
+type NvidiaDriverCaps map[NvidiaDriverCap]struct{}
+
+// See nvidia-container-toolkit/internal/config/image/capabilities.go:DefaultDriverCapabilities.
+var nvproxyDefaultDriverCaps = NvidiaDriverCaps{
+	computeCap: struct{}{},
+	utilityCap: struct{}{},
+}

-// AnnotationNVProxy enables nvproxy.
-const AnnotationNVProxy = "dev.gvisor.internal.nvproxy"
+func nvidiaDriverCapsFromString(caps string) NvidiaDriverCaps {
+	res := make(NvidiaDriverCaps)
+	for _, cap := range strings.Split(caps, ",") {
+		trimmed := strings.TrimSpace(cap)
+		if len(trimmed) == 0 {
+			continue
+		}
+		res[NvidiaDriverCap(trimmed)] = struct{}{}
+	}
+	return res
+}
+
+func (c NvidiaDriverCaps) hasAll() bool {
+	_, ok := c[allCap]
+	return ok
+}
+
+// Intersect returns the intersection of two sets of driver capabilities.
+func (c NvidiaDriverCaps) Intersect(c2 NvidiaDriverCaps) NvidiaDriverCaps {
+	if c2.hasAll() {
+		return c
+	}
+	if c.hasAll() {
+		return c2
+	}
+	res := make(NvidiaDriverCaps)
+	for cap := range c2 {
+		if _, ok := c[cap]; ok {
+			res[cap] = struct{}{}
+		}
+	}
+	return res
+}
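
A minimal sketch of Intersect's semantics, assuming a build against gvisor at this commit (the capability names are arbitrary examples):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/runsc/specutils"
)

func main() {
	allowed := specutils.NvidiaDriverCaps{"compute": {}, "utility": {}}
	requested := specutils.NvidiaDriverCaps{"utility": {}, "graphics": {}}

	// Plain intersection: only capabilities present in both sets survive.
	fmt.Println(allowed.Intersect(requested)) // map[utility:{}]

	// "all" is special-cased: intersecting with it returns the other set unchanged.
	all := specutils.NvidiaDriverCaps{"all": {}}
	fmt.Println(allowed.Intersect(all)) // map[compute:{} utility:{}]
}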

// NVProxyEnabled checks both the nvproxy annotation and conf.NVProxy to see if nvproxy is enabled.
func NVProxyEnabled(spec *specs.Spec, conf *config.Config) bool {
Expand Down Expand Up @@ -78,7 +135,7 @@ func gpuFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) boo
	if spec.Process == nil {
		return false
	}
-	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
+	nvd, _ := EnvVar(spec.Process.Env, nvidiaVisibleDevsEnv)
	// A value of "none" means "no GPU device, but still access to driver
	// functionality", so it is not a value we check for here.
	return nvd != "" && nvd != "void"
@@ -105,7 +162,7 @@ func isNvidiaHookPresent(spec *specs.Spec, conf *config.Config) bool {
//
// Precondition: conf.NVProxyDocker && GPUFunctionalityRequested(spec, conf).
func ParseNvidiaVisibleDevices(spec *specs.Spec) (string, error) {
-	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
+	nvd, _ := EnvVar(spec.Process.Env, nvidiaVisibleDevsEnv)
	if nvd == "none" {
		return "", nil
	}
@@ -130,3 +187,34 @@ func ParseNvidiaVisibleDevices(spec *specs.Spec) (string, error) {
	}
	return nvd, nil
}
+
+// NvproxyDriverCapsFromEnv returns the driver capabilities requested by the
+// application via the NVIDIA_DRIVER_CAPABILITIES env var. See
+// nvidia-container-toolkit/cmd/nvidia-container-runtime-hook/container_config.go:getDriverCapabilities().
+func NvproxyDriverCapsFromEnv(spec *specs.Spec, conf *config.Config) (NvidiaDriverCaps, error) {
+	allowedDriverCaps := nvidiaDriverCapsFromString(conf.NVProxyAllowedDriverCapabilities)
+	driverCapsEnvStr, ok := EnvVar(spec.Process.Env, nvidiaDriverCapsEnv)
+	if !ok {
+		if IsLegacyCudaImage(spec) {
+			return allowedDriverCaps, nil
+		}
+		return nvproxyDefaultDriverCaps, nil
+	}
+	if len(driverCapsEnvStr) == 0 {
+		return nvproxyDefaultDriverCaps, nil
+	}
+	envDriverCaps := nvidiaDriverCapsFromString(driverCapsEnvStr)
+	driverCaps := allowedDriverCaps.Intersect(envDriverCaps)
+	if !envDriverCaps.hasAll() && len(driverCaps) != len(envDriverCaps) {
+		return nil, fmt.Errorf("disallowed driver capabilities requested: '%v' (allowed '%v'), update --nvproxy-allowed-driver-capabilities to allow them", envDriverCaps, driverCaps)
+	}
+	return driverCaps, nil
+}
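
A sketch of the resolution logic in use, assuming a build against gvisor at this commit; the Config is zero-valued except for the allowed-capabilities list:

package main

import (
	"fmt"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/specutils"
)

func main() {
	conf := &config.Config{NVProxyAllowedDriverCapabilities: "utility,compute"}

	// "all" resolves to the full allowed set (compute and utility here).
	spec := &specs.Spec{Process: &specs.Process{Env: []string{"NVIDIA_DRIVER_CAPABILITIES=all"}}}
	caps, err := specutils.NvproxyDriverCapsFromEnv(spec, conf)
	fmt.Println(caps, err)

	// Requesting a capability outside the allowed set is rejected.
	spec.Process.Env = []string{"NVIDIA_DRIVER_CAPABILITIES=graphics"}
	if _, err := specutils.NvproxyDriverCapsFromEnv(spec, conf); err != nil {
		fmt.Println(err)
	}
}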

+// IsLegacyCudaImage returns true if spec represents a legacy CUDA image.
+// See nvidia-container-toolkit/internal/config/image/cuda_image.go:IsLegacy().
+func IsLegacyCudaImage(spec *specs.Spec) bool {
+	cudaVersion, _ := EnvVar(spec.Process.Env, cudaVersionEnv)
+	requireCuda, _ := EnvVar(spec.Process.Env, requireCudaEnv)
+	return len(cudaVersion) > 0 && len(requireCuda) == 0
+}
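
In other words, images old enough to set CUDA_VERSION without NVIDIA_REQUIRE_CUDA are treated like nvidia-container-toolkit's legacy mode: when NVIDIA_DRIVER_CAPABILITIES is unset, NvproxyDriverCapsFromEnv above grants them the full allowed set rather than only the compute/utility defaults.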
