From 8499bc222b78c8ae530b1a19b42e343ae501edff Mon Sep 17 00:00:00 2001 From: Etienne Perot Date: Tue, 3 Sep 2024 16:48:10 -0700 Subject: [PATCH] Integrate GPU `ioctl` sniffer in GPU tests. This wraps all GPU tests' command line with the nvproxy ioctl sniffer. This has multiple functions: - Verifying that the application does not call ioctls unsupported by nvproxy. This is controlled by a `AllowIncompatibleIoctl` option, which is initially set to `true` in all tests to mirror current behavior, but should be flipped as we verify that they do not call unsupported ioctls. - Verifying that the sniffer itself works transparently for a wide range of applications. - Later down the line, enforcing that the application only calls ioctls that are part of GPU capabilities that it has a need for. This is controlled by a capability string which is currently only used to set the `NVIDIA_DRIVER_CAPABILITIES` environment variable. Updates issue #10856 PiperOrigin-RevId: 670751227 --- Makefile | 1 + pkg/test/dockerutil/BUILD | 14 +++ pkg/test/dockerutil/container.go | 66 ++++++++++-- pkg/test/dockerutil/gpu.go | 110 ++++++++++++++++++-- test/gpu/BUILD | 19 +--- test/gpu/cuda_test.go | 29 ++++-- test/gpu/nccl_test.go | 5 +- test/gpu/ollama/ollama.go | 7 +- test/gpu/pytorch_test.go | 5 +- test/gpu/smoke_test.go | 14 ++- test/gpu/sniffer_test.go | 64 +++--------- test/gpu/sr_test.go | 5 +- test/gpu/stablediffusion/stablediffusion.go | 7 +- test/gpu/vllm/vllm_test.go | 5 +- test/root/cgroup_test.go | 20 +++- tools/ioctl_sniffer/BUILD | 3 +- 16 files changed, 262 insertions(+), 112 deletions(-) diff --git a/Makefile b/Makefile index bb22627816..34781b7cdd 100644 --- a/Makefile +++ b/Makefile @@ -177,6 +177,7 @@ dev: $(RUNTIME_BIN) ## Installs a set of local runtimes. Requires sudo. 
@$(call configure_noreload,$(RUNTIME)-p,--net-raw --profile) @$(call configure_noreload,$(RUNTIME)-cgroup-d,--net-raw --debug --strace --log-packets --cgroupfs) @$(call configure_noreload,$(RUNTIME)-systemd-d,--net-raw --debug --strace --log-packets --systemd-cgroup) + @$(call configure_noreload,$(RUNTIME)-gpu,--nvproxy) @$(call reload_docker) .PHONY: dev diff --git a/pkg/test/dockerutil/BUILD b/pkg/test/dockerutil/BUILD index 65aa5638b2..2b120c614f 100644 --- a/pkg/test/dockerutil/BUILD +++ b/pkg/test/dockerutil/BUILD @@ -5,6 +5,17 @@ package( licenses = ["notice"], ) +# We copy the `run_sniffer` binary here because `go:embed` can only embed +# from the current directory or subdirectories, not parents of it. +genrule( + name = "run_sniffer_bin", + srcs = [ + "//tools/ioctl_sniffer:run_sniffer", + ], + outs = ["run_sniffer_copy"], + cmd = "cat < $(SRCS) > $@", +) + go_library( name = "dockerutil", testonly = 1, @@ -16,6 +27,9 @@ go_library( "network.go", "profile.go", ], + embedsrcs = [ + ":run_sniffer_bin", # keep + ], visibility = ["//:sandbox"], deps = [ "//pkg/sync", diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go index 9bc3d5ae29..a83f5d322c 100644 --- a/pkg/test/dockerutil/container.go +++ b/pkg/test/dockerutil/container.go @@ -115,6 +115,10 @@ type RunOpts struct { DeviceRequests []container.DeviceRequest Devices []container.DeviceMapping + + // sniffGPUOpts, if set, sets the rules for GPU sniffing during this test. + // Must be set via `RunOpts.SniffGPU`. + sniffGPUOpts *SniffGPUOpts } func makeContainer(ctx context.Context, logger testutil.Logger, runtime string) *Container { @@ -164,7 +168,11 @@ func MakeNativeContainer(ctx context.Context, logger testutil.Logger) *Container // Spawn is analogous to 'docker run -d'. 
func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error { - if err := c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil); err != nil { + cfg, err := c.config(ctx, r, args) + if err != nil { + return fmt.Errorf("container config: %w", err) + } + if err := c.create(ctx, r.Image, cfg, c.hostConfig(r), nil); err != nil { return err } return c.Start(ctx) @@ -173,7 +181,10 @@ func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error // SpawnProcess is analogous to 'docker run -it'. It returns a process // which represents the root process. func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string) (Process, error) { - config, hostconf, netconf := c.ConfigsFrom(r, args...) + config, hostconf, netconf, err := c.ConfigsFrom(ctx, r, args...) + if err != nil { + return Process{}, fmt.Errorf("container config: %w", err) + } config.Tty = true config.OpenStdin = true @@ -204,7 +215,11 @@ func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string) // Run is analogous to 'docker run'. func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string, error) { - if err := c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil); err != nil { + cfg, err := c.config(ctx, r, args) + if err != nil { + return "", fmt.Errorf("container config: %w", err) + } + if err := c.create(ctx, r.Image, cfg, c.hostConfig(r), nil); err != nil { return "", err } @@ -223,8 +238,12 @@ func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string, // ConfigsFrom returns container configs from RunOpts and args. The caller should call 'CreateFrom' // and Start. 
-func (c *Container) ConfigsFrom(r RunOpts, args ...string) (*container.Config, *container.HostConfig, *network.NetworkingConfig) { - return c.config(r, args), c.hostConfig(r), &network.NetworkingConfig{} +func (c *Container) ConfigsFrom(ctx context.Context, r RunOpts, args ...string) (*container.Config, *container.HostConfig, *network.NetworkingConfig, error) { + cfg, err := c.config(ctx, r, args) + if err != nil { + return nil, nil, nil, fmt.Errorf("container config: %w", err) + } + return cfg, c.hostConfig(r), &network.NetworkingConfig{}, nil } // MakeLink formats a link to add to a RunOpts. @@ -239,7 +258,11 @@ func (c *Container) CreateFrom(ctx context.Context, profileImage string, conf *c // Create is analogous to 'docker create'. func (c *Container) Create(ctx context.Context, r RunOpts, args ...string) error { - return c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil) + cfg, err := c.config(ctx, r, args) + if err != nil { + return fmt.Errorf("container config: %w", err) + } + return c.create(ctx, r.Image, cfg, c.hostConfig(r), nil) } func (c *Container) create(ctx context.Context, profileImage string, conf *container.Config, hostconf *container.HostConfig, netconf *network.NetworkingConfig) error { @@ -271,7 +294,7 @@ func (c *Container) create(ctx context.Context, profileImage string, conf *conta return nil } -func (c *Container) config(r RunOpts, args []string) *container.Config { +func (c *Container) config(ctx context.Context, r RunOpts, args []string) (*container.Config, error) { ports := nat.PortSet{} for _, p := range r.Ports { port := nat.Port(fmt.Sprintf("%d", p)) @@ -279,15 +302,38 @@ func (c *Container) config(r RunOpts, args []string) *container.Config { } env := append(r.Env, fmt.Sprintf("RUNSC_TEST_NAME=%s", c.Name)) + image := testutil.ImageByName(r.Image) + entrypoint := r.Entrypoint + if r.sniffGPUOpts != nil { + c.cleanups = append(c.cleanups, func() { + r.sniffGPUOpts.cleanup() + }) + if len(entrypoint) == 0 && len(args) == 
0 { + // Need to look up the image's default entrypoint/args so we can prepend to them. + // If we don't, then we will end up overwriting them. + imageInfo, _, err := c.client.ImageInspectWithRaw(ctx, image) + if err != nil { + return nil, fmt.Errorf("cannot inspect image %q: %w", image, err) + } + entrypoint = []string(imageInfo.Config.Entrypoint) + args = []string(imageInfo.Config.Cmd) + } + if len(entrypoint) != 0 { + entrypoint = r.sniffGPUOpts.prepend(entrypoint) + } else { + args = r.sniffGPUOpts.prepend(args) + } + } + return &container.Config{ - Image: testutil.ImageByName(r.Image), + Image: image, Cmd: args, - Entrypoint: r.Entrypoint, + Entrypoint: entrypoint, ExposedPorts: ports, Env: env, WorkingDir: r.WorkDir, User: r.User, - } + }, nil } func (c *Container) hostConfig(r RunOpts) *container.HostConfig { diff --git a/pkg/test/dockerutil/gpu.go b/pkg/test/dockerutil/gpu.go index eb86ecd462..1ac95f4b9f 100644 --- a/pkg/test/dockerutil/gpu.go +++ b/pkg/test/dockerutil/gpu.go @@ -22,6 +22,9 @@ import ( "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/mount" + + // Needed for go:embed + _ "embed" ) // Flags. @@ -29,15 +32,57 @@ var ( setCOSGPU = flag.Bool("cos-gpu", false, "set to configure GPU settings for COS, as opposed to Docker") ) -// AllGPUCapabilities is the environment variable that enables all NVIDIA GPU -// capabilities within a container. -const AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all" +//go:embed run_sniffer_copy +var runSnifferBinary []byte + +const ( + // ioctlSnifferMountPath is the in-container path at which the ioctl sniffer is mounted. + ioctlSnifferMountPath = "/ioctl_sniffer" +) + +const ( + // AllGPUCapabilities is the environment variable that enables all NVIDIA + // GPU capabilities within a container. + AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all" + + // DefaultGPUCapabilities is the environment variable that enables default + // NVIDIA GPU capabilities within a container. 
+ DefaultGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=compute,utility" +) // GPURunOpts returns Docker run options with GPU support enabled. -func GPURunOpts() RunOpts { +func GPURunOpts(sniffGPUOpts SniffGPUOpts) (RunOpts, error) { + // Extract the sniffer binary to a temporary location. + runSniffer, err := os.CreateTemp("", "run_sniffer.*") + if err != nil { + return RunOpts{}, fmt.Errorf("failed to create temporary file: %w", err) + } + if _, err := runSniffer.Write(runSnifferBinary); err != nil { + return RunOpts{}, fmt.Errorf("failed to write to temporary file: %w", err) + } + if err := runSniffer.Sync(); err != nil { + return RunOpts{}, fmt.Errorf("failed to sync temporary file: %w", err) + } + if err := runSniffer.Chmod(0o555); err != nil { + return RunOpts{}, fmt.Errorf("failed to chmod temporary file: %w", err) + } + if err := runSniffer.Close(); err != nil { + return RunOpts{}, fmt.Errorf("failed to close temporary file: %w", err) + } + sniffGPUOpts.runSniffer = runSniffer + mounts := []mount.Mount{ + { + Source: runSniffer.Name(), + Target: ioctlSnifferMountPath, + Type: mount.TypeBind, + ReadOnly: true, + }, + } + gpuEnv := []string{sniffGPUOpts.GPUCapabilities()} + if !*setCOSGPU { return RunOpts{ - Env: []string{AllGPUCapabilities}, + Env: gpuEnv, DeviceRequests: []container.DeviceRequest{ { Count: -1, @@ -45,7 +90,9 @@ func GPURunOpts() RunOpts { Options: map[string]string{}, }, }, - } + Mounts: mounts, + sniffGPUOpts: &sniffGPUOpts, + }, nil } // COS has specific settings since it has a custom installer for GPU drivers. 
@@ -68,7 +115,6 @@ func GPURunOpts() RunOpts { }) } - var mounts []mount.Mount for _, nvidiaBin := range []string{ "/home/kubernetes/bin/nvidia/bin", "/var/lib/nvidia/bin", @@ -97,10 +143,54 @@ func GPURunOpts() RunOpts { } return RunOpts{ - Env: []string{AllGPUCapabilities}, - Mounts: mounts, - Devices: devices, + Env: gpuEnv, + Mounts: mounts, + Devices: devices, + sniffGPUOpts: &sniffGPUOpts, + }, nil +} + +// SniffGPUOpts dictates options to sniff GPU workloads. +type SniffGPUOpts struct { + // If true, the test will not fail even when the workload calls incompatible + // ioctls. Useful for debugging. + // TODO(b/340955577): Should be converted to a flag and removed from this + // struct once all GPU tests have no incompatible ioctls. + AllowIncompatibleIoctl bool + + // The set of GPU capabilities exposed to the container. + // If unset, defaults to `DefaultGPUCapabilities`. + Capabilities string + + // The fields below are set internally. + runSniffer *os.File +} + +// GPUCapabilities returns the set of GPU capabilities meant to be +// exposed to the container. +func (sgo *SniffGPUOpts) GPUCapabilities() string { + if sgo.Capabilities == "" { + return DefaultGPUCapabilities + } + return sgo.Capabilities +} + +// prepend prepends the sniffer arguments to the given command. +func (sgo *SniffGPUOpts) prepend(argv []string) []string { + snifferArgv := []string{ + ioctlSnifferMountPath, + "--verbose=true", + fmt.Sprintf("--enforce_compatibility=%t", !sgo.AllowIncompatibleIoctl), + // TODO(eperot): Add flag to enforce capability set here once implemented. + } + return append(snifferArgv, argv...) +} + +func (sgo *SniffGPUOpts) cleanup() error { + if err := os.Remove(sgo.runSniffer.Name()); err != nil { + return fmt.Errorf("failed to unlink temporary file %q: %w", sgo.runSniffer.Name(), err) } + return nil } // NumGPU crudely estimates the number of NVIDIA GPUs on the host. 
diff --git a/test/gpu/BUILD b/test/gpu/BUILD index 0fd44e2cc7..25317239ea 100644 --- a/test/gpu/BUILD +++ b/test/gpu/BUILD @@ -103,33 +103,16 @@ go_test( deps = ["//test/gpu/stablediffusion"], ) -# We copy the `run_sniffer` binary here because `go:embed` can only embed -# from the current directory or subdirectories, not parents of it. -genrule( - name = "run_sniffer_copy", - srcs = [ - "//tools/ioctl_sniffer:run_sniffer", - ], - outs = ["run_sniffer_copy"], - cmd = "cat < $(SRCS) > $@", -) - go_test( name = "sniffer_test", srcs = ["sniffer_test.go"], - embedsrcs = [ - ":run_sniffer_copy", # keep - ], tags = [ "manual", "noguitar", "notap", ], visibility = ["//:sandbox"], - deps = [ - "//pkg/test/dockerutil", - "@com_github_docker_docker//api/types/mount:go_default_library", - ], + deps = ["//pkg/test/dockerutil"], ) go_test( diff --git a/test/gpu/cuda_test.go b/test/gpu/cuda_test.go index 141e028a92..9762f5527b 100644 --- a/test/gpu/cuda_test.go +++ b/test/gpu/cuda_test.go @@ -395,10 +395,13 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm } // getContainerOpts returns the container run options to run CUDA tests. -func getContainerOpts() dockerutil.RunOpts { - opts := dockerutil.GPURunOpts() +func getContainerOpts() (dockerutil.RunOpts, error) { + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true}) + if err != nil { + return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err) + } opts.Image = "gpu/cuda-tests" - return opts + return opts, nil } // testLog logs a line as a test log. 
@@ -446,7 +449,11 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error) } featuresContainer := dockerutil.MakeContainer(ctx, t) defer featuresContainer.CleanUp(ctx) - featuresList, err := featuresContainer.Run(ctx, getContainerOpts(), "/list_features.sh") + runOpts, err := getContainerOpts() + if err != nil { + return nil, fmt.Errorf("failed to get container options: %w", err) + } + featuresList, err := featuresContainer.Run(ctx, runOpts, "/list_features.sh") if err != nil { return nil, fmt.Errorf("cannot get list of CUDA features: %v", err) } @@ -656,7 +663,11 @@ func TestCUDA(t *testing.T) { // Get a list of sample tests. listContainer := dockerutil.MakeContainer(ctx, t) defer listContainer.CleanUp(ctx) - testsList, err := listContainer.Run(ctx, getContainerOpts(), "/list_sample_tests.sh") + runOpts, err := getContainerOpts() + if err != nil { + t.Fatalf("Failed to get container options: %v", err) + } + testsList, err := listContainer.Run(ctx, runOpts, "/list_sample_tests.sh") if err != nil { t.Fatalf("Cannot list sample tests: %v", err) } @@ -700,7 +711,11 @@ func TestCUDA(t *testing.T) { for i := 0; i < numContainers; i++ { spawnGroup.Go(func() error { c := dockerutil.MakeContainer(ctx, t) - if err := c.Spawn(spawnCtx, getContainerOpts(), "/bin/sleep", "6h"); err != nil { + runOpts, err := getContainerOpts() + if err != nil { + return fmt.Errorf("failed to get container options: %w", err) + } + if err := c.Spawn(spawnCtx, runOpts, "/bin/sleep", "6h"); err != nil { return fmt.Errorf("container %v failed to spawn: %w", c.Name, err) } containers[i] = c @@ -790,7 +805,7 @@ func TestCUDA(t *testing.T) { " $ docker run --runtime=%s --gpus=all -e %s --rm %s /run_sample %s", dockerutil.Runtime(), dockerutil.AllGPUCapabilities, - getContainerOpts().Image, + runOpts.Image, failedTests[0], ) } diff --git a/test/gpu/nccl_test.go b/test/gpu/nccl_test.go index 8b9ee18abe..74c7772b9c 100644 --- a/test/gpu/nccl_test.go +++ b/test/gpu/nccl_test.go 
@@ -27,7 +27,10 @@ import ( func runNCCL(ctx context.Context, t *testing.T, testName string) { t.Helper() c := dockerutil.MakeContainer(ctx, t) - opts := dockerutil.GPURunOpts() + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true}) + if err != nil { + t.Fatalf("Failed to get GPU run options: %v", err) + } opts.Image = "gpu/nccl-tests" cmd := fmt.Sprintf("/nccl-tests/build/%s", testName) out, err := c.Run(ctx, opts, cmd) diff --git a/test/gpu/ollama/ollama.go b/test/gpu/ollama/ollama.go index 8326105df9..172cf278e2 100644 --- a/test/gpu/ollama/ollama.go +++ b/test/gpu/ollama/ollama.go @@ -149,7 +149,12 @@ type dockerServer struct { // NewDocker returns a new Ollama client talking to an Ollama server that runs // in a local Docker container. func NewDocker(ctx context.Context, cont *dockerutil.Container, logger testutil.Logger) (*Ollama, error) { - opts := dockerutil.GPURunOpts() + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ + AllowIncompatibleIoctl: true, + }) + if err != nil { + return nil, fmt.Errorf("failed to get GPU run options: %w", err) + } opts.Image = "gpu/ollama" started := time.Now() if err := cont.Spawn(ctx, opts); err != nil { diff --git a/test/gpu/pytorch_test.go b/test/gpu/pytorch_test.go index 04f617754b..15b5ffa3a3 100644 --- a/test/gpu/pytorch_test.go +++ b/test/gpu/pytorch_test.go @@ -26,7 +26,10 @@ import ( func runPytorch(ctx context.Context, t *testing.T, scriptPath string, args ...string) { t.Helper() c := dockerutil.MakeContainer(ctx, t) - opts := dockerutil.GPURunOpts() + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true}) + if err != nil { + t.Fatalf("Failed to get GPU run options: %v", err) + } opts.Image = "gpu/pytorch" cmd := append([]string{"python3", scriptPath}, args...) out, err := c.Run(ctx, opts, cmd...) 
diff --git a/test/gpu/smoke_test.go b/test/gpu/smoke_test.go index 5e273357cb..a3236dafe7 100644 --- a/test/gpu/smoke_test.go +++ b/test/gpu/smoke_test.go @@ -27,13 +27,16 @@ func TestGPUHello(t *testing.T) { c := dockerutil.MakeContainer(ctx, t) defer c.CleanUp(ctx) - opts := dockerutil.GPURunOpts() + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true}) + if err != nil { + t.Fatalf("failed to get GPU run options: %v", err) + } opts.Image = "basic/cuda-vector-add" out, err := c.Run(ctx, opts) + t.Logf("cuda-vector-add output: %s", string(out)) if err != nil { t.Fatalf("could not run cuda-vector-add: %v", err) } - t.Logf("cuda-vector-add output: %s", string(out)) } func TestCUDASmokeTests(t *testing.T) { @@ -41,11 +44,14 @@ func TestCUDASmokeTests(t *testing.T) { c := dockerutil.MakeContainer(ctx, t) defer c.CleanUp(ctx) - opts := dockerutil.GPURunOpts() + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true}) + if err != nil { + t.Fatalf("failed to get GPU run options: %v", err) + } opts.Image = "gpu/cuda-tests" out, err := c.Run(ctx, opts, "/run_smoke.sh") + t.Logf("cuda-tests smoke tests output: %s", string(out)) if err != nil { t.Fatalf("could not run cuda-tests smoke tests: %v", err) } - t.Logf("cuda-tests smoke tests output: %s", string(out)) } diff --git a/test/gpu/sniffer_test.go b/test/gpu/sniffer_test.go index 6d7e088af2..c89254f002 100644 --- a/test/gpu/sniffer_test.go +++ b/test/gpu/sniffer_test.go @@ -18,70 +18,32 @@ package sniffer_test import ( "context" "errors" - "os" + "fmt" "strings" "testing" "time" - "github.com/docker/docker/api/types/mount" "gvisor.dev/gvisor/pkg/test/dockerutil" - - // Needed for go:embed - _ "embed" ) const maxDuration = 1 * time.Minute -//go:embed run_sniffer_copy -var runSnifferBinary []byte - -// RunCommand runs the given command via the sniffer, with the -enforce_compatibility flag. -// -// It's run in a docker container, with the cuda-tests image. 
+// runCUDATestsCommand runs the given command via the sniffer with compatibility +// enforcement enabled. +// It's run in a docker container, with the cuda-tests image. func runCUDATestsCommand(t *testing.T, cmd ...string) (string, error) { - // Extract the sniffer binary to a temporary location. - runSniffer, err := os.CreateTemp("/tmp", "run_sniffer.*") - if err != nil { - t.Fatalf("Failed to create temporary file: %v", err) - } - defer func() { - if err := runSniffer.Close(); err != nil { - t.Fatalf("Failed to close temporary file: %v", err) - } - if err := os.Remove(runSniffer.Name()); err != nil { - t.Fatalf("Failed to unlink temporary file: %v", err) - } - }() - if _, err := runSniffer.Write(runSnifferBinary); err != nil { - t.Fatalf("Failed to write to temporary file: %v", err) - } - if err := runSniffer.Sync(); err != nil { - t.Fatalf("Failed to sync temporary file: %v", err) - } - if err := runSniffer.Chmod(0o555); err != nil { - t.Fatalf("Failed to chmod temporary file: %v", err) - } - - // Set up our docker container ctx, cancel := context.WithTimeoutCause(context.Background(), maxDuration, errors.New("overall test timed out")) defer cancel() - - listContainer := dockerutil.MakeContainer(ctx, t) - defer listContainer.CleanUp(ctx) - - // Mount the sniffer binary into the container - opts := dockerutil.GPURunOpts() - opts.Image = "gpu/cuda-tests" - opts.Mounts = append(opts.Mounts, mount.Mount{ - Type: mount.TypeBind, - Source: runSniffer.Name(), - Target: "/run_sniffer", - ReadOnly: false, + container := dockerutil.MakeContainer(ctx, t) + defer container.CleanUp(ctx) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ + AllowIncompatibleIoctl: false, }) - - command := append([]string{"/run_sniffer", "-enforce_compatibility", "-verbose"}, cmd...) - output, err := listContainer.Run(ctx, opts, command...) 
- return output, err + if err != nil { + return "", fmt.Errorf("failed to get GPU run options: %w", err) + } + opts.Image = "gpu/cuda-tests" + return container.Run(ctx, opts, cmd...) } func TestSupportedCUDAProgram(t *testing.T) { diff --git a/test/gpu/sr_test.go b/test/gpu/sr_test.go index 3f98315e16..5f9db3dd27 100644 --- a/test/gpu/sr_test.go +++ b/test/gpu/sr_test.go @@ -34,7 +34,10 @@ func TestGPUCheckpointRestore(t *testing.T) { c := dockerutil.MakeContainer(ctx, t) defer c.CleanUp(ctx) - opts := dockerutil.GPURunOpts() + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true}) + if err != nil { + t.Fatalf("failed to get GPU run options: %v", err) + } opts.Image = "basic/cuda-vector-add" if err := c.Spawn(ctx, opts, "sleep", "infinity"); err != nil { t.Fatalf("could not run cuda-vector-add: %v", err) diff --git a/test/gpu/stablediffusion/stablediffusion.go b/test/gpu/stablediffusion/stablediffusion.go index 9b6cba9ab0..511b00d535 100644 --- a/test/gpu/stablediffusion/stablediffusion.go +++ b/test/gpu/stablediffusion/stablediffusion.go @@ -47,7 +47,12 @@ type dockerRunner struct { func (dr *dockerRunner) Run(ctx context.Context, image string, argv []string) ([]byte, error) { cont := dockerutil.MakeContainer(ctx, dr.logger) defer cont.CleanUp(ctx) - opts := dockerutil.GPURunOpts() + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ + AllowIncompatibleIoctl: true, + }) + if err != nil { + return nil, fmt.Errorf("failed to get GPU run options: %w", err) + } opts.Image = image if err := cont.Spawn(ctx, opts, argv...); err != nil { return nil, fmt.Errorf("could not start Stable Diffusion container: %v", err) diff --git a/test/gpu/vllm/vllm_test.go b/test/gpu/vllm/vllm_test.go index 67ebc4fa23..d4d55c1382 100644 --- a/test/gpu/vllm/vllm_test.go +++ b/test/gpu/vllm/vllm_test.go @@ -51,7 +51,10 @@ func doVLLMTest(b *testing.B) { } // Run vllm. 
- runOpts := dockerutil.GPURunOpts() + runOpts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{AllowIncompatibleIoctl: true}) + if err != nil { + b.Fatalf("failed to get GPU run options: %v", err) + } runOpts.Image = "gpu/vllm" runOpts.Env = []string{"PYTHONPATH=$PYTHONPATH:/vllm"} diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go index f4133af671..fab4db3d3d 100644 --- a/test/root/cgroup_test.go +++ b/test/root/cgroup_test.go @@ -250,9 +250,12 @@ func TestCgroupV1(t *testing.T) { } // Make configs. - conf, hostconf, _ := d.ConfigsFrom(dockerutil.RunOpts{ + conf, hostconf, _, err := d.ConfigsFrom(ctx, dockerutil.RunOpts{ Image: "basic/alpine", }, "sleep", "10000") + if err != nil { + t.Fatalf("Cannot get container config: %v", err) + } // Add Cgroup arguments to configs. for _, attr := range attrs { @@ -416,9 +419,12 @@ func TestCgroupV2(t *testing.T) { baseCgroupPath = cgroupPath("system.slice") } // Make configs. - conf, hostconf, _ := d.ConfigsFrom(dockerutil.RunOpts{ + conf, hostconf, _, err := d.ConfigsFrom(ctx, dockerutil.RunOpts{ Image: "basic/alpine", }, "sleep", "10000") + if err != nil { + t.Fatalf("Cannot get container config: %v", err) + } // Add Cgroup arguments to configs. for _, attr := range attrs { @@ -515,9 +521,12 @@ func TestCgroupParent(t *testing.T) { if useSystemd { parent = "system-runsc.slice" } - conf, hostconf, _ := d.ConfigsFrom(dockerutil.RunOpts{ + conf, hostconf, _, err := d.ConfigsFrom(ctx, dockerutil.RunOpts{ Image: "basic/alpine", }, "sleep", "10000") + if err != nil { + t.Fatalf("Cannot get container config: %v", err) + } hostconf.Resources.CgroupParent = parent if err := d.CreateFrom(ctx, "basic/alpine", conf, hostconf, nil); err != nil { @@ -571,9 +580,12 @@ func TestSystemdCgroupJoinTwice(t *testing.T) { // Construct a known cgroup name. 
parent := "system-runsc.slice" - conf, hostconf, _ := d.ConfigsFrom(dockerutil.RunOpts{ + conf, hostconf, _, err := d.ConfigsFrom(ctx, dockerutil.RunOpts{ Image: "basic/alpine", }, "sleep", "10000") + if err != nil { + t.Fatalf("Cannot get container config: %v", err) + } hostconf.Resources.CgroupParent = parent if err := d.CreateFrom(ctx, "basic/alpine", conf, hostconf, nil); err != nil { diff --git a/tools/ioctl_sniffer/BUILD b/tools/ioctl_sniffer/BUILD index 0c2a4038dd..a34e391421 100644 --- a/tools/ioctl_sniffer/BUILD +++ b/tools/ioctl_sniffer/BUILD @@ -48,8 +48,7 @@ go_binary( nogo = False, static = True, visibility = [ - "//test:__subpackages__", - "//tools/ioctl_sniffer:__subpackages__", + "//:sandbox", ], deps = [ "//pkg/log",