Pass the NVIDIA_DRIVER_CAPABILITIES env var to nvidia-container-cli.
runsc attempts to emulate nvidia-container-runtime-hook, but it was always
passing "--compute --utility" as the driver capability flags to the
`nvidia-container-cli configure` command.

Fix runsc to emulate nvidia-container-runtime-hook correctly by parsing
NVIDIA_DRIVER_CAPABILITIES and converting that comma-separated list to flags.
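
A minimal sketch of that conversion (illustrative only; the real
implementation lives in runsc/specutils/nvidia.go below):

	package main

	import (
		"fmt"
		"strings"
	)

	// capsToFlags turns a comma-separated NVIDIA_DRIVER_CAPABILITIES value
	// into nvidia-container-cli flags, trimming whitespace and skipping
	// empty entries.
	func capsToFlags(env string) []string {
		var flags []string
		for _, c := range strings.Split(env, ",") {
			if c = strings.TrimSpace(c); c != "" {
				flags = append(flags, "--"+c)
			}
		}
		return flags
	}

	func main() {
		fmt.Println(capsToFlags("compute,video")) // [--compute --video]
	}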

This is in preparation for adding support for non-compute GPU workloads in
nvproxy :)

Updates #9452
Updates #10856

PiperOrigin-RevId: 671644915
ayushr2 authored and gvisor-bot committed Sep 9, 2024
1 parent 905d769 commit ef4bf25
Showing 4 changed files with 125 additions and 8 deletions.
14 changes: 14 additions & 0 deletions runsc/config/config.go
@@ -325,6 +325,10 @@ type Config struct {
// the latest supported NVIDIA driver ABI.
NVProxyDriverVersion string `flag:"nvproxy-driver-version"`

// NVProxyAllowedDriverCapabilities is a comma-separated list of driver
// capabilities that are allowed to be requested by the container.
NVProxyAllowedDriverCapabilities string `flag:"nvproxy-allowed-driver-capabilities"`

// TPUProxy enables support for TPUs.
TPUProxy bool `flag:"tpuproxy"`

@@ -408,6 +412,16 @@ func (c *Config) validate() error {
if len(c.ProfilingMetrics) > 0 && len(c.ProfilingMetricsLog) == 0 {
return fmt.Errorf("profiling-metrics flag requires defining a profiling-metrics-log for output")
}
if c.NVProxyAllowedDriverCapabilities == "all" {
return fmt.Errorf("nvproxy-allowed-driver-capabilities cannot be set to 'all', please set it to the exact capabilities you want to allow")
}
for _, cap := range strings.Split(c.NVProxyAllowedDriverCapabilities, ",") {
switch cap {
case "compute", "compat32", "graphics", "utility", "video", "display", "ngx":
default:
return fmt.Errorf("nvproxy-allowed-driver-capabilities contains invalid capability %q", cap)
}
}
return nil
}
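
A sketch of how this validation behaves, as a table-driven test (testConfig
is a hypothetical helper that returns a valid default Config):

	func TestNVProxyAllowedDriverCapabilities(t *testing.T) {
		for _, tc := range []struct {
			caps    string
			wantErr bool
		}{
			{"utility,compute", false},  // the default, accepted
			{"compute,graphics", false}, // any subset of valid caps is accepted
			{"all", true},               // must be spelled out explicitly
			{"compute,teleport", true},  // unknown capability
		} {
			c := testConfig(t) // hypothetical helper
			c.NVProxyAllowedDriverCapabilities = tc.caps
			if err := c.validate(); (err != nil) != tc.wantErr {
				t.Errorf("caps=%q: err=%v, wantErr=%t", tc.caps, err, tc.wantErr)
			}
		}
	}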

1 change: 1 addition & 0 deletions runsc/config/flags.go
@@ -134,6 +134,7 @@ func RegisterFlags(flagSet *flag.FlagSet) {
flagSet.Bool("nvproxy", false, "EXPERIMENTAL: enable support for Nvidia GPUs")
flagSet.Bool("nvproxy-docker", false, "DEPRECATED: use nvidia-container-runtime or `docker run --gpus` directly. Or manually add nvidia-container-runtime-hook as a prestart hook and set up NVIDIA_VISIBLE_DEVICES container environment variable.")
flagSet.String("nvproxy-driver-version", "", "NVIDIA driver ABI version to use. If empty, autodetect installed driver version. The special value 'latest' may also be used to use the latest ABI.")
flagSet.String("nvproxy-allowed-driver-capabilities", "utility,compute", "Comma separated list of NVIDIA driver capabilities that are allowed to be requested by the container.")
flagSet.Bool("tpuproxy", false, "EXPERIMENTAL: enable support for TPU device passthrough.")

// Test flags, not to be used outside tests, ever.
14 changes: 11 additions & 3 deletions runsc/container/container.go
Expand Up @@ -2011,12 +2011,20 @@ func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCm
"configure",
fmt.Sprintf("--ldconfig=@%s", ldconfigPath),
"--no-cgroups", // runsc doesn't configure device cgroups yet
"--utility",
"--compute",
fmt.Sprintf("--pid=%d", goferCmd.Process.Pid),
fmt.Sprintf("--device=%s", devices),
spec.Root.Path,
}
// Pass driver capabilities specified via NVIDIA_DRIVER_CAPABILITIES as flags. See
// nvidia-container-toolkit/cmd/nvidia-container-runtime-hook/main.go:doPrestart().
driverCaps, err := specutils.NvproxyDriverCapsFromEnv(spec, conf)
if err != nil {
return fmt.Errorf("failed to get driver capabilities: %w", err)
}
for cap := range driverCaps {
argv = append(argv, cap.ToFlag())
}
// Add rootfs path as the final argument.
argv = append(argv, spec.Root.Path)
log.Debugf("Executing %q", argv)
var stdout, stderr strings.Builder
cmd := exec.Cmd{
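
For illustration, with NVIDIA_DRIVER_CAPABILITIES=utility,compute the
assembled argv resembles the following (paths, PID, and device list are
made up; capability flag order varies because driverCaps is a map):

	argv := []string{
		"nvidia-container-cli",
		"configure",
		"--ldconfig=@/sbin/ldconfig",
		"--no-cgroups",
		"--pid=1234",
		"--device=all",
		"--utility",              // from NVIDIA_DRIVER_CAPABILITIES
		"--compute",              // from NVIDIA_DRIVER_CAPABILITIES
		"/containers/app/rootfs", // rootfs appended last
	}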
104 changes: 99 additions & 5 deletions runsc/specutils/nvidia.go
@@ -23,10 +23,73 @@ import (
"gvisor.dev/gvisor/runsc/config"
)

const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES"
const (
nvidiaVisibleDevsEnv = "NVIDIA_VISIBLE_DEVICES"
nvidiaDriverCapsEnv = "NVIDIA_DRIVER_CAPABILITIES"
cudaVersionEnv = "CUDA_VERSION"
requireCudaEnv = "NVIDIA_REQUIRE_CUDA"
// AnnotationNVProxy enables nvproxy.
AnnotationNVProxy = "dev.gvisor.internal.nvproxy"
)

// NvidiaDriverCap is a GPU driver capability (like compute, graphics, etc.).
type NvidiaDriverCap string

const (
computeCap NvidiaDriverCap = "compute"
utilityCap NvidiaDriverCap = "utility"
// allCap is a special value that means all supported driver capabilities.
allCap NvidiaDriverCap = "all"
)

// AnnotationNVProxy enables nvproxy.
const AnnotationNVProxy = "dev.gvisor.internal.nvproxy"
// ToFlag converts the driver capability to a flag for nvidia-container-cli.
// See nvidia-container-toolkit/cmd/nvidia-container-runtime-hook/capabilities.go:capabilityToCLI().
func (c NvidiaDriverCap) ToFlag() string {
return "--" + string(c)
}

// NvidiaDriverCaps is a set of GPU driver capabilities.
type NvidiaDriverCaps map[NvidiaDriverCap]struct{}

// See nvidia-container-toolkit/internal/config/image/capabilities.go:DefaultDriverCapabilities.
var nvproxyDefaultDriverCaps = NvidiaDriverCaps{
computeCap: struct{}{},
utilityCap: struct{}{},
}

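// nvidiaDriverCapsFromString parses a comma-separated capability list,
// trimming whitespace and dropping empty entries; e.g. "compute, video,"
// yields {compute, video}.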
func nvidiaDriverCapsFromString(caps string) NvidiaDriverCaps {
res := make(NvidiaDriverCaps)
for _, cap := range strings.Split(caps, ",") {
trimmed := strings.TrimSpace(cap)
if len(trimmed) == 0 {
continue
}
res[NvidiaDriverCap(trimmed)] = struct{}{}
}
return res
}

func (c NvidiaDriverCaps) hasAll() bool {
_, ok := c[allCap]
return ok
}

// Intersect returns the intersection of two sets of driver capabilities.
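// "all" acts as a wildcard: if either set contains "all", the other set is
// returned unchanged; e.g. intersecting {compute, utility} with {all}
// yields {compute, utility}.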
func (c NvidiaDriverCaps) Intersect(c2 NvidiaDriverCaps) NvidiaDriverCaps {
if c2.hasAll() {
return c
}
if c.hasAll() {
return c2
}
res := make(NvidiaDriverCaps)
for cap := range c2 {
if _, ok := c[cap]; ok {
res[cap] = struct{}{}
}
}
return res
}

// NVProxyEnabled checks both the nvproxy annotation and conf.NVProxy to see if nvproxy is enabled.
func NVProxyEnabled(spec *specs.Spec, conf *config.Config) bool {
@@ -78,7 +141,7 @@ func gpuFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) boo
if spec.Process == nil {
return false
}
nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
nvd, _ := EnvVar(spec.Process.Env, nvidiaVisibleDevsEnv)
// A value of "none" means "no GPU device, but still access to driver
// functionality", so it is not a value we check for here.
return nvd != "" && nvd != "void"
@@ -105,7 +168,7 @@ func isNvidiaHookPresent(spec *specs.Spec, conf *config.Config) bool {
//
// Precondition: conf.NVProxyDocker && GPUFunctionalityRequested(spec, conf).
func ParseNvidiaVisibleDevices(spec *specs.Spec) (string, error) {
nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
nvd, _ := EnvVar(spec.Process.Env, nvidiaVisibleDevsEnv)
if nvd == "none" {
return "", nil
}
@@ -130,3 +193,34 @@ func ParseNvidiaVisibleDevices(spec *specs.Spec) (string, error) {
}
return nvd, nil
}

// NvproxyDriverCapsFromEnv returns the driver capabilities requested by the
// application via the NVIDIA_DRIVER_CAPABILITIES env var. See
// nvidia-container-toolkit/cmd/nvidia-container-runtime-hook/container_config.go:getDriverCapabilities().
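//
// Resolution mirrors the hook: if the env var is unset, the defaults
// (utility, compute) are returned, except for legacy CUDA images, which get
// everything in --nvproxy-allowed-driver-capabilities; if the env var is
// set but empty, the defaults are returned; if it requests "all", the full
// allowed set is returned; otherwise every requested capability must be
// allowed, or an error is returned.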
func NvproxyDriverCapsFromEnv(spec *specs.Spec, conf *config.Config) (NvidiaDriverCaps, error) {
allowedDriverCaps := nvidiaDriverCapsFromString(conf.NVProxyAllowedDriverCapabilities)
driverCapsEnvStr, ok := EnvVar(spec.Process.Env, nvidiaDriverCapsEnv)
if !ok {
if IsLegacyCudaImage(spec) {
return allowedDriverCaps, nil
}
return nvproxyDefaultDriverCaps, nil
}
if len(driverCapsEnvStr) == 0 {
return nvproxyDefaultDriverCaps, nil
}
envDriverCaps := nvidiaDriverCapsFromString(driverCapsEnvStr)
driverCaps := allowedDriverCaps.Intersect(envDriverCaps)
if !envDriverCaps.hasAll() && len(driverCaps) != len(envDriverCaps) {
return nil, fmt.Errorf("disallowed driver capabilities requested: '%v' (allowed '%v'), update --nvproxy-allowed-driver-capabilities to allow them", envDriverCaps, driverCaps)
}
return driverCaps, nil
}

// IsLegacyCudaImage returns true if spec represents a legacy CUDA image.
// See nvidia-container-toolkit/internal/config/image/cuda_image.go:IsLegacy().
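//
// For example, CUDA_VERSION=10.0 with NVIDIA_REQUIRE_CUDA unset indicates a
// legacy image; once NVIDIA_REQUIRE_CUDA is also set, the image is no
// longer considered legacy.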
func IsLegacyCudaImage(spec *specs.Spec) bool {
cudaVersion, _ := EnvVar(spec.Process.Env, cudaVersionEnv)
requireCuda, _ := EnvVar(spec.Process.Env, requireCudaEnv)
return len(cudaVersion) > 0 && len(requireCuda) == 0
}
