Skip to content

Commit

Permalink
Integrate GPU ioctl sniffer in GPU tests.
Browse files Browse the repository at this point in the history
This wraps all GPU tests' command line with the nvproxy ioctl sniffer.

This has multiple functions:

- Verifying that the application does not call ioctls unsupported by
  nvproxy. This is controlled by an `AllowIncompatibleIoctl` option, which
  is initially set to `true` in all tests to mirror current behavior, but
  should be flipped as we verify that they do not call unsupported ioctls.
- Verifying that the sniffer itself works transparently for a wide range
  of applications.
- Later down the line, enforcing that the application only calls ioctls
  that are part of GPU capabilities that it has a need for. This is
  controlled by a capability string which is currently only used to set
  the `NVIDIA_DRIVER_CAPABILITIES` environment variable.

Updates issue #10856

PiperOrigin-RevId: 670751227
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Sep 4, 2024
1 parent 932d9dc commit 3363c61
Show file tree
Hide file tree
Showing 15 changed files with 201 additions and 97 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ dev: $(RUNTIME_BIN) ## Installs a set of local runtimes. Requires sudo.
@$(call configure_noreload,$(RUNTIME)-p,--net-raw --profile)
@$(call configure_noreload,$(RUNTIME)-cgroup-d,--net-raw --debug --strace --log-packets --cgroupfs)
@$(call configure_noreload,$(RUNTIME)-systemd-d,--net-raw --debug --strace --log-packets --systemd-cgroup)
@$(call configure_noreload,$(RUNTIME)-gpu,--nvproxy)
@$(call reload_docker)
.PHONY: dev

Expand Down
14 changes: 14 additions & 0 deletions pkg/test/dockerutil/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ package(
licenses = ["notice"],
)

# We copy the `run_sniffer` binary here because `go:embed` can only embed
# from the current directory or subdirectories, not parents of it.
genrule(
name = "run_sniffer_bin",
srcs = [
"//tools/ioctl_sniffer:run_sniffer",
],
outs = ["run_sniffer_copy"],
cmd = "cat < $(SRCS) > $@",
)

go_library(
name = "dockerutil",
testonly = 1,
Expand All @@ -16,6 +27,9 @@ go_library(
"network.go",
"profile.go",
],
embedsrcs = [
":run_sniffer_bin", # keep
],
visibility = ["//:sandbox"],
deps = [
"//pkg/sync",
Expand Down
14 changes: 13 additions & 1 deletion pkg/test/dockerutil/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ type RunOpts struct {
DeviceRequests []container.DeviceRequest

Devices []container.DeviceMapping

// sniffGPUOpts, if set, sets the rules for GPU sniffing during this test.
// Must be set via `RunOpts.SniffGPU`.
sniffGPUOpts *SniffGPUOpts
}

func makeContainer(ctx context.Context, logger testutil.Logger, runtime string) *Container {
Expand Down Expand Up @@ -279,10 +283,18 @@ func (c *Container) config(r RunOpts, args []string) *container.Config {
}
env := append(r.Env, fmt.Sprintf("RUNSC_TEST_NAME=%s", c.Name))

entrypoint := r.Entrypoint
if r.sniffGPUOpts != nil {
c.cleanups = append(c.cleanups, func() {
r.sniffGPUOpts.cleanup()
})
entrypoint = r.sniffGPUOpts.prepend(entrypoint)
}

return &container.Config{
Image: testutil.ImageByName(r.Image),
Cmd: args,
Entrypoint: r.Entrypoint,
Entrypoint: entrypoint,
ExposedPorts: ports,
Env: env,
WorkingDir: r.WorkDir,
Expand Down
110 changes: 100 additions & 10 deletions pkg/test/dockerutil/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,77 @@ import (

"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/mount"

// Needed for go:embed
_ "embed"
)

// Flags.
var (
setCOSGPU = flag.Bool("cos-gpu", false, "set to configure GPU settings for COS, as opposed to Docker")
)

// AllGPUCapabilities is the environment variable that enables all NVIDIA GPU
// capabilities within a container.
const AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"
//go:embed run_sniffer_copy
var runSnifferBinary []byte

const (
// ioctlSnifferMountPath is the in-container path at which the ioctl sniffer is mounted.
// GPURunOpts bind-mounts the extracted sniffer binary read-only at this
// path, and SniffGPUOpts.prepend uses it as the first element of the
// container command line.
ioctlSnifferMountPath = "/ioctl_sniffer"
)

// Both constants below are complete `KEY=VALUE` environment entries (not
// bare capability lists), so they can be placed directly in a container's
// environment.
const (
// AllGPUCapabilities is the environment variable that enables all NVIDIA
// GPU capabilities within a container.
AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"

// DefaultGPUCapabilities is the environment variable that enables default
// NVIDIA GPU capabilities within a container.
DefaultGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=compute,utility"
)

// GPURunOpts returns Docker run options with GPU support enabled.
func GPURunOpts() RunOpts {
func GPURunOpts(sniffGPUOpts SniffGPUOpts) (RunOpts, error) {
// Extract the sniffer binary to a temporary location.
runSniffer, err := os.CreateTemp("", "run_sniffer.*")
if err != nil {
return RunOpts{}, fmt.Errorf("failed to create temporary file: %w", err)
}
if _, err := runSniffer.Write(runSnifferBinary); err != nil {
return RunOpts{}, fmt.Errorf("failed to write to temporary file: %w", err)
}
if err := runSniffer.Sync(); err != nil {
return RunOpts{}, fmt.Errorf("failed to sync temporary file: %w", err)
}
if err := runSniffer.Chmod(0o555); err != nil {
return RunOpts{}, fmt.Errorf("failed to chmod temporary file: %w", err)
}
if err := runSniffer.Close(); err != nil {
return RunOpts{}, fmt.Errorf("failed to close temporary file: %w", err)
}
sniffGPUOpts.runSniffer = runSniffer
mounts := []mount.Mount{
{
Source: runSniffer.Name(),
Target: ioctlSnifferMountPath,
Type: mount.TypeBind,
ReadOnly: true,
},
}
gpuEnv := []string{sniffGPUOpts.GPUCapabilities()}

if !*setCOSGPU {
return RunOpts{
Env: []string{AllGPUCapabilities},
Env: gpuEnv,
DeviceRequests: []container.DeviceRequest{
{
Count: -1,
Capabilities: [][]string{{"gpu"}},
Options: map[string]string{},
},
},
}
Mounts: mounts,
sniffGPUOpts: &sniffGPUOpts,
}, nil
}

// COS has specific settings since it has a custom installer for GPU drivers.
Expand All @@ -68,7 +115,6 @@ func GPURunOpts() RunOpts {
})
}

var mounts []mount.Mount
for _, nvidiaBin := range []string{
"/home/kubernetes/bin/nvidia/bin",
"/var/lib/nvidia/bin",
Expand Down Expand Up @@ -97,10 +143,54 @@ func GPURunOpts() RunOpts {
}

return RunOpts{
Env: []string{AllGPUCapabilities},
Mounts: mounts,
Devices: devices,
Env: gpuEnv,
Mounts: mounts,
Devices: devices,
sniffGPUOpts: &sniffGPUOpts,
}, nil
}

// SniffGPUOpts dictates options for sniffing the ioctls of GPU workloads.
// Callers fill in the exported fields and pass the struct to GPURunOpts,
// which populates the unexported fields.
type SniffGPUOpts struct {
// If true, the test will not fail even when the workload calls incompatible
// ioctls. Useful for debugging.
// TODO(b/340955577): Should be converted to a flag and removed from this
// struct once all GPU tests have no incompatible ioctls.
AllowIncompatibleIoctl bool

// The set of GPU capabilities exposed to the container.
// If unset, defaults to `DefaultGPUCapabilities`.
// GPURunOpts places the result of GPUCapabilities() verbatim into the
// container environment, so a non-empty value here is expected to be a
// full `NVIDIA_DRIVER_CAPABILITIES=...` entry (like `AllGPUCapabilities`),
// not a bare capability list.
Capabilities string

// The fields below are set internally.

// runSniffer is the temporary host file into which GPURunOpts extracts the
// embedded sniffer binary. Its name is used as the bind-mount source and
// is the file that cleanup removes.
runSniffer *os.File
}

// GPUCapabilities reports the GPU capabilities string that should be exposed
// to the container: the explicitly configured `Capabilities` when non-empty,
// and `DefaultGPUCapabilities` otherwise.
func (sgo *SniffGPUOpts) GPUCapabilities() string {
	if configured := sgo.Capabilities; configured != "" {
		return configured
	}
	return DefaultGPUCapabilities
}

// prepend returns a new command line consisting of the sniffer invocation
// (its in-container path followed by its flags) and then the given command.
// Compatibility enforcement is enabled exactly when AllowIncompatibleIoctl is
// false.
func (sgo *SniffGPUOpts) prepend(argv []string) []string {
	enforce := !sgo.AllowIncompatibleIoctl
	// TODO(eperot): Add flag to enforce capability set here once implemented.
	const numSnifferArgs = 3
	wrapped := make([]string, 0, numSnifferArgs+len(argv))
	wrapped = append(wrapped, ioctlSnifferMountPath)
	wrapped = append(wrapped, "--verbose=true")
	wrapped = append(wrapped, fmt.Sprintf("--enforce_compatibility=%t", enforce))
	wrapped = append(wrapped, argv...)
	return wrapped
}

// cleanup removes the temporary host file that holds the extracted sniffer
// binary.
//
// It is a no-op when no sniffer file has been recorded (for example on a
// SniffGPUOpts that did not go through GPURunOpts), and it treats a file that
// is already gone as success, so it is safe to run more than once for the
// same options. Any other removal failure is returned, wrapped with the
// file's path.
func (sgo *SniffGPUOpts) cleanup() error {
	if sgo == nil || sgo.runSniffer == nil {
		// Nothing was extracted, so there is nothing to remove.
		return nil
	}
	path := sgo.runSniffer.Name()
	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to unlink temporary file %q: %w", path, err)
	}
	return nil
}

// NumGPU crudely estimates the number of NVIDIA GPUs on the host.
Expand Down
19 changes: 1 addition & 18 deletions test/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -103,33 +103,16 @@ go_test(
deps = ["//test/gpu/stablediffusion"],
)

# We copy the `run_sniffer` binary here because `go:embed` can only embed
# from the current directory or subdirectories, not parents of it.
genrule(
name = "run_sniffer_copy",
srcs = [
"//tools/ioctl_sniffer:run_sniffer",
],
outs = ["run_sniffer_copy"],
cmd = "cat < $(SRCS) > $@",
)

go_test(
name = "sniffer_test",
srcs = ["sniffer_test.go"],
embedsrcs = [
":run_sniffer_copy", # keep
],
tags = [
"manual",
"noguitar",
"notap",
],
visibility = ["//:sandbox"],
deps = [
"//pkg/test/dockerutil",
"@com_github_docker_docker//api/types/mount:go_default_library",
],
deps = ["//pkg/test/dockerutil"],
)

go_test(
Expand Down
29 changes: 22 additions & 7 deletions test/gpu/cuda_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,10 +395,13 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm
}

// getContainerOpts returns the container run options to run CUDA tests.
func getContainerOpts() dockerutil.RunOpts {
opts := dockerutil.GPURunOpts()
func getContainerOpts() (dockerutil.RunOpts, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true})
if err != nil {
return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err)
}
opts.Image = "gpu/cuda-tests"
return opts
return opts, nil
}

// testLog logs a line as a test log.
Expand Down Expand Up @@ -446,7 +449,11 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error)
}
featuresContainer := dockerutil.MakeContainer(ctx, t)
defer featuresContainer.CleanUp(ctx)
featuresList, err := featuresContainer.Run(ctx, getContainerOpts(), "/list_features.sh")
runOpts, err := getContainerOpts()
if err != nil {
return nil, fmt.Errorf("failed to get container options: %w", err)
}
featuresList, err := featuresContainer.Run(ctx, runOpts, "/list_features.sh")
if err != nil {
return nil, fmt.Errorf("cannot get list of CUDA features: %v", err)
}
Expand Down Expand Up @@ -656,7 +663,11 @@ func TestCUDA(t *testing.T) {
// Get a list of sample tests.
listContainer := dockerutil.MakeContainer(ctx, t)
defer listContainer.CleanUp(ctx)
testsList, err := listContainer.Run(ctx, getContainerOpts(), "/list_sample_tests.sh")
runOpts, err := getContainerOpts()
if err != nil {
t.Fatalf("Failed to get container options: %v", err)
}
testsList, err := listContainer.Run(ctx, runOpts, "/list_sample_tests.sh")
if err != nil {
t.Fatalf("Cannot list sample tests: %v", err)
}
Expand Down Expand Up @@ -700,7 +711,11 @@ func TestCUDA(t *testing.T) {
for i := 0; i < numContainers; i++ {
spawnGroup.Go(func() error {
c := dockerutil.MakeContainer(ctx, t)
if err := c.Spawn(spawnCtx, getContainerOpts(), "/bin/sleep", "6h"); err != nil {
runOpts, err := getContainerOpts()
if err != nil {
return fmt.Errorf("failed to get container options: %w", err)
}
if err := c.Spawn(spawnCtx, runOpts, "/bin/sleep", "6h"); err != nil {
return fmt.Errorf("container %v failed to spawn: %w", c.Name, err)
}
containers[i] = c
Expand Down Expand Up @@ -790,7 +805,7 @@ func TestCUDA(t *testing.T) {
" $ docker run --runtime=%s --gpus=all -e %s --rm %s /run_sample %s",
dockerutil.Runtime(),
dockerutil.AllGPUCapabilities,
getContainerOpts().Image,
runOpts.Image,
failedTests[0],
)
}
Expand Down
5 changes: 4 additions & 1 deletion test/gpu/nccl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ import (
func runNCCL(ctx context.Context, t *testing.T, testName string) {
t.Helper()
c := dockerutil.MakeContainer(ctx, t)
opts := dockerutil.GPURunOpts()
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
}
opts.Image = "gpu/nccl-tests"
cmd := fmt.Sprintf("/nccl-tests/build/%s", testName)
out, err := c.Run(ctx, opts, cmd)
Expand Down
7 changes: 6 additions & 1 deletion test/gpu/ollama/ollama.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,12 @@ type dockerServer struct {
// NewDocker returns a new Ollama client talking to an Ollama server that runs
// in a local Docker container.
func NewDocker(ctx context.Context, cont *dockerutil.Container, logger testutil.Logger) (*Ollama, error) {
opts := dockerutil.GPURunOpts()
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
AllowIncompatibleIoctl: true,
})
if err != nil {
return nil, fmt.Errorf("failed to get GPU run options: %w", err)
}
opts.Image = "gpu/ollama"
started := time.Now()
if err := cont.Spawn(ctx, opts); err != nil {
Expand Down
5 changes: 4 additions & 1 deletion test/gpu/pytorch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ import (
func runPytorch(ctx context.Context, t *testing.T, scriptPath string, args ...string) {
t.Helper()
c := dockerutil.MakeContainer(ctx, t)
opts := dockerutil.GPURunOpts()
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
}
opts.Image = "gpu/pytorch"
cmd := append([]string{"python3", scriptPath}, args...)
out, err := c.Run(ctx, opts, cmd...)
Expand Down
10 changes: 8 additions & 2 deletions test/gpu/smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ func TestGPUHello(t *testing.T) {
c := dockerutil.MakeContainer(ctx, t)
defer c.CleanUp(ctx)

opts := dockerutil.GPURunOpts()
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true})
if err != nil {
t.Fatalf("failed to get GPU run options: %v", err)
}
opts.Image = "basic/cuda-vector-add"
out, err := c.Run(ctx, opts)
if err != nil {
Expand All @@ -41,7 +44,10 @@ func TestCUDASmokeTests(t *testing.T) {
c := dockerutil.MakeContainer(ctx, t)
defer c.CleanUp(ctx)

opts := dockerutil.GPURunOpts()
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true})
if err != nil {
t.Fatalf("failed to get GPU run options: %v", err)
}
opts.Image = "gpu/cuda-tests"
out, err := c.Run(ctx, opts, "/run_smoke.sh")
if err != nil {
Expand Down
Loading

0 comments on commit 3363c61

Please sign in to comment.