Skip to content

Commit

Permalink
Integrate GPU ioctl sniffer in GPU tests.
Browse files Browse the repository at this point in the history
This wraps all GPU tests' command line with the nvproxy ioctl sniffer.

This has multiple functions:

- Verifying that the application does not call ioctls unsupported by
  nvproxy. This is controlled by an `AllowIncompatibleIoctl` option, which
  is initially set to `true` in all tests to mirror current behavior, but
  should be flipped as we verify that they do not call unsupported ioctls.
- Verifying that the sniffer itself works transparently for a wide range
  of applications.
- Later down the line, enforcing that the application only calls ioctls
  that are part of GPU capabilities that it has a need for. This is
  controlled by a capability string which is currently only used to set
  the `NVIDIA_DRIVER_CAPABILITIES` environment variable.

Updates issue #10856

PiperOrigin-RevId: 670751227
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Sep 9, 2024
1 parent 905d769 commit b72df79
Show file tree
Hide file tree
Showing 16 changed files with 275 additions and 112 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ dev: $(RUNTIME_BIN) ## Installs a set of local runtimes. Requires sudo.
@$(call configure_noreload,$(RUNTIME)-p,--net-raw --profile)
@$(call configure_noreload,$(RUNTIME)-cgroup-d,--net-raw --debug --strace --log-packets --cgroupfs)
@$(call configure_noreload,$(RUNTIME)-systemd-d,--net-raw --debug --strace --log-packets --systemd-cgroup)
@$(call configure_noreload,$(RUNTIME)-gpu,--nvproxy)
@$(call reload_docker)
.PHONY: dev

Expand Down
14 changes: 14 additions & 0 deletions pkg/test/dockerutil/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ package(
licenses = ["notice"],
)

# We copy the `run_sniffer` binary here because `go:embed` can only embed
# from the current directory or subdirectories, not parents of it.
# The copy is written to `run_sniffer_copy`; that exact file name is what
# the `//go:embed run_sniffer_copy` directive in gpu.go refers to, so the
# two must be kept in sync.
genrule(
name = "run_sniffer_bin",
srcs = [
"//tools/ioctl_sniffer:run_sniffer",
],
outs = ["run_sniffer_copy"],
cmd = "cat < $(SRCS) > $@",
)

go_library(
name = "dockerutil",
testonly = 1,
Expand All @@ -16,6 +27,9 @@ go_library(
"network.go",
"profile.go",
],
embedsrcs = [
":run_sniffer_bin", # keep
],
visibility = ["//:sandbox"],
deps = [
"//pkg/sync",
Expand Down
66 changes: 56 additions & 10 deletions pkg/test/dockerutil/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ type RunOpts struct {
DeviceRequests []container.DeviceRequest

Devices []container.DeviceMapping

// sniffGPUOpts, if set, sets the rules for GPU sniffing during this test.
// Must be set via `RunOpts.SniffGPU`.
sniffGPUOpts *SniffGPUOpts
}

func makeContainer(ctx context.Context, logger testutil.Logger, runtime string) *Container {
Expand Down Expand Up @@ -164,7 +168,11 @@ func MakeNativeContainer(ctx context.Context, logger testutil.Logger) *Container

// Spawn is analogous to 'docker run -d'.
func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error {
if err := c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil); err != nil {
cfg, err := c.config(ctx, r, args)
if err != nil {
return fmt.Errorf("container config: %w", err)
}
if err := c.create(ctx, r.Image, cfg, c.hostConfig(r), nil); err != nil {
return err
}
return c.Start(ctx)
Expand All @@ -173,7 +181,10 @@ func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error
// SpawnProcess is analogous to 'docker run -it'. It returns a process
// which represents the root process.
func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string) (Process, error) {
config, hostconf, netconf := c.ConfigsFrom(r, args...)
config, hostconf, netconf, err := c.ConfigsFrom(ctx, r, args...)
if err != nil {
return Process{}, fmt.Errorf("container config: %w", err)
}
config.Tty = true
config.OpenStdin = true

Expand Down Expand Up @@ -204,7 +215,11 @@ func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string)

// Run is analogous to 'docker run'.
func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string, error) {
if err := c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil); err != nil {
cfg, err := c.config(ctx, r, args)
if err != nil {
return "", fmt.Errorf("container config: %w", err)
}
if err := c.create(ctx, r.Image, cfg, c.hostConfig(r), nil); err != nil {
return "", err
}

Expand All @@ -223,8 +238,12 @@ func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string,

// ConfigsFrom returns container configs from RunOpts and args. The caller should call 'CreateFrom'
// and Start.
func (c *Container) ConfigsFrom(r RunOpts, args ...string) (*container.Config, *container.HostConfig, *network.NetworkingConfig) {
return c.config(r, args), c.hostConfig(r), &network.NetworkingConfig{}
func (c *Container) ConfigsFrom(ctx context.Context, r RunOpts, args ...string) (*container.Config, *container.HostConfig, *network.NetworkingConfig, error) {
cfg, err := c.config(ctx, r, args)
if err != nil {
return nil, nil, nil, fmt.Errorf("container config: %w", err)
}
return cfg, c.hostConfig(r), &network.NetworkingConfig{}, nil
}

// MakeLink formats a link to add to a RunOpts.
Expand All @@ -239,7 +258,11 @@ func (c *Container) CreateFrom(ctx context.Context, profileImage string, conf *c

// Create is analogous to 'docker create'.
func (c *Container) Create(ctx context.Context, r RunOpts, args ...string) error {
return c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil)
cfg, err := c.config(ctx, r, args)
if err != nil {
return fmt.Errorf("container config: %w", err)
}
return c.create(ctx, r.Image, cfg, c.hostConfig(r), nil)
}

func (c *Container) create(ctx context.Context, profileImage string, conf *container.Config, hostconf *container.HostConfig, netconf *network.NetworkingConfig) error {
Expand Down Expand Up @@ -271,23 +294,46 @@ func (c *Container) create(ctx context.Context, profileImage string, conf *conta
return nil
}

func (c *Container) config(r RunOpts, args []string) *container.Config {
func (c *Container) config(ctx context.Context, r RunOpts, args []string) (*container.Config, error) {
ports := nat.PortSet{}
for _, p := range r.Ports {
port := nat.Port(fmt.Sprintf("%d", p))
ports[port] = struct{}{}
}
env := append(r.Env, fmt.Sprintf("RUNSC_TEST_NAME=%s", c.Name))

image := testutil.ImageByName(r.Image)
entrypoint := r.Entrypoint
if r.sniffGPUOpts != nil {
c.cleanups = append(c.cleanups, func() {
r.sniffGPUOpts.cleanup()
})
if len(entrypoint) == 0 && len(args) == 0 {
// Need to look up the image's default entrypoint/args so we can prepend to them.
// If we don't, then we will end up overwriting them.
imageInfo, _, err := c.client.ImageInspectWithRaw(ctx, image)
if err != nil {
return nil, fmt.Errorf("cannot inspect image %q: %w", image, err)
}
entrypoint = []string(imageInfo.Config.Entrypoint)
args = []string(imageInfo.Config.Cmd)
}
if len(entrypoint) != 0 {
entrypoint = r.sniffGPUOpts.prepend(entrypoint)
} else {
args = r.sniffGPUOpts.prepend(args)
}
}

return &container.Config{
Image: testutil.ImageByName(r.Image),
Image: image,
Cmd: args,
Entrypoint: r.Entrypoint,
Entrypoint: entrypoint,
ExposedPorts: ports,
Env: env,
WorkingDir: r.WorkDir,
User: r.User,
}
}, nil
}

func (c *Container) hostConfig(r RunOpts) *container.HostConfig {
Expand Down
121 changes: 111 additions & 10 deletions pkg/test/dockerutil/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,78 @@ import (

"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/mount"

// Needed for go:embed
_ "embed"
)

// Flags.
var (
setCOSGPU = flag.Bool("cos-gpu", false, "set to configure GPU settings for COS, as opposed to Docker")
)

// AllGPUCapabilities is the environment variable that enables all NVIDIA GPU
// capabilities within a container.
const AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"
//go:embed run_sniffer_copy
var runSnifferBinary []byte

const (
// ioctlSnifferMountPath is the in-container path at which the ioctl sniffer is mounted.
ioctlSnifferMountPath = "/ioctl_sniffer"
)

const (
// AllGPUCapabilities is the environment variable assignment (in
// `NAME=value` form, ready to be used as a container environment entry)
// that enables all NVIDIA GPU capabilities within a container.
AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"

// DefaultGPUCapabilities is the environment variable assignment (in
// `NAME=value` form, ready to be used as a container environment entry)
// that enables the default NVIDIA GPU capabilities within a container.
DefaultGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=compute,utility"
)

// GPURunOpts returns Docker run options with GPU support enabled.
func GPURunOpts() RunOpts {
func GPURunOpts(sniffGPUOpts SniffGPUOpts) (RunOpts, error) {
var mounts []mount.Mount
if sniffGPUOpts.DisableSnifferReason == "" {
// Extract the sniffer binary to a temporary location.
runSniffer, err := os.CreateTemp("", "run_sniffer.*")
if err != nil {
return RunOpts{}, fmt.Errorf("failed to create temporary file: %w", err)
}
if _, err := runSniffer.Write(runSnifferBinary); err != nil {
return RunOpts{}, fmt.Errorf("failed to write to temporary file: %w", err)
}
if err := runSniffer.Sync(); err != nil {
return RunOpts{}, fmt.Errorf("failed to sync temporary file: %w", err)
}
if err := runSniffer.Chmod(0o555); err != nil {
return RunOpts{}, fmt.Errorf("failed to chmod temporary file: %w", err)
}
if err := runSniffer.Close(); err != nil {
return RunOpts{}, fmt.Errorf("failed to close temporary file: %w", err)
}
sniffGPUOpts.runSniffer = runSniffer
mounts = append(mounts, mount.Mount{
Source: runSniffer.Name(),
Target: ioctlSnifferMountPath,
Type: mount.TypeBind,
ReadOnly: true,
})
}
gpuEnv := []string{sniffGPUOpts.GPUCapabilities()}

if !*setCOSGPU {
return RunOpts{
Env: []string{AllGPUCapabilities},
Env: gpuEnv,
DeviceRequests: []container.DeviceRequest{
{
Count: -1,
Capabilities: [][]string{{"gpu"}},
Options: map[string]string{},
},
},
}
Mounts: mounts,
sniffGPUOpts: &sniffGPUOpts,
}, nil
}

// COS has specific settings since it has a custom installer for GPU drivers.
Expand All @@ -68,7 +116,6 @@ func GPURunOpts() RunOpts {
})
}

var mounts []mount.Mount
for _, nvidiaBin := range []string{
"/home/kubernetes/bin/nvidia/bin",
"/var/lib/nvidia/bin",
Expand Down Expand Up @@ -97,10 +144,64 @@ func GPURunOpts() RunOpts {
}

return RunOpts{
Env: []string{AllGPUCapabilities},
Mounts: mounts,
Devices: devices,
Env: gpuEnv,
Mounts: mounts,
Devices: devices,
sniffGPUOpts: &sniffGPUOpts,
}, nil
}

// SniffGPUOpts dictates options for sniffing the ioctls of GPU workloads.
type SniffGPUOpts struct {
// If set, explains why the sniffer should be disabled for this test.
// If unset or empty, the sniffer is enabled.
// When non-empty, no sniffer binary is extracted by `GPURunOpts`, `prepend`
// leaves the command untouched and `cleanup` is a no-op.
DisableSnifferReason string

// If true, the test will not fail even when the workload calls incompatible
// ioctls. Useful for debugging.
// The negation of this value is passed to the sniffer as its
// `--enforce_compatibility` flag.
// TODO(b/340955577): Should be converted to a flag and removed from this
// struct once all GPU tests have no incompatible ioctls.
AllowIncompatibleIoctl bool

// The set of GPU capabilities exposed to the container.
// If unset, defaults to `DefaultGPUCapabilities`.
// NOTE(review): `GPURunOpts` places the result of `GPUCapabilities()`
// verbatim into the container environment, so this presumably has to be a
// full `NVIDIA_DRIVER_CAPABILITIES=...` assignment such as
// `AllGPUCapabilities`, not a bare capability list; confirm with callers.
Capabilities string

// The fields below are set internally.

// runSniffer is the temporary file that `GPURunOpts` extracts the sniffer
// binary into. Only its name is used afterwards: as the bind-mount source
// and as the path removed by `cleanup`.
runSniffer *os.File
}

// GPUCapabilities reports the GPU capabilities setting to expose to the
// container: the explicitly configured `Capabilities` when it is non-empty,
// and `DefaultGPUCapabilities` otherwise.
func (sgo *SniffGPUOpts) GPUCapabilities() string {
	if caps := sgo.Capabilities; caps != "" {
		return caps
	}
	return DefaultGPUCapabilities
}

// prepend wraps the given command with the ioctl sniffer: the result is the
// sniffer's in-container path and its flags, followed by the original argv.
// When the sniffer is disabled, argv is returned untouched.
func (sgo *SniffGPUOpts) prepend(argv []string) []string {
	snifferEnabled := sgo.DisableSnifferReason == ""
	if !snifferEnabled {
		return argv
	}
	enforceFlag := fmt.Sprintf("--enforce_compatibility=%t", !sgo.AllowIncompatibleIoctl)
	wrapped := make([]string, 0, 3+len(argv))
	wrapped = append(wrapped, ioctlSnifferMountPath, "--verbose=true", enforceFlag)
	// TODO(eperot): Add flag to enforce capability set here once implemented.
	wrapped = append(wrapped, argv...)
	return wrapped
}

// cleanup removes the temporary file holding the extracted sniffer binary.
//
// It is a no-op when the sniffer is disabled or when no binary was ever
// extracted (runSniffer is nil), so it never dereferences a nil file.
// It is also idempotent: a file that is already gone counts as success,
// because the same cleanup can be registered and run more than once for a
// single set of options. Any other removal failure is returned to the caller.
func (sgo *SniffGPUOpts) cleanup() error {
	if sgo.DisableSnifferReason != "" || sgo.runSniffer == nil {
		return nil // Sniffer disabled or never extracted, so nothing to clean up.
	}
	path := sgo.runSniffer.Name()
	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to unlink temporary file %q: %w", path, err)
	}
	return nil
}

// NumGPU crudely estimates the number of NVIDIA GPUs on the host.
Expand Down
19 changes: 1 addition & 18 deletions test/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -103,33 +103,16 @@ go_test(
deps = ["//test/gpu/stablediffusion"],
)

# We copy the `run_sniffer` binary here because `go:embed` can only embed
# from the current directory or subdirectories, not parents of it.
genrule(
name = "run_sniffer_copy",
srcs = [
"//tools/ioctl_sniffer:run_sniffer",
],
outs = ["run_sniffer_copy"],
cmd = "cat < $(SRCS) > $@",
)

go_test(
name = "sniffer_test",
srcs = ["sniffer_test.go"],
embedsrcs = [
":run_sniffer_copy", # keep
],
tags = [
"manual",
"noguitar",
"notap",
],
visibility = ["//:sandbox"],
deps = [
"//pkg/test/dockerutil",
"@com_github_docker_docker//api/types/mount:go_default_library",
],
deps = ["//pkg/test/dockerutil"],
)

go_test(
Expand Down
Loading

0 comments on commit b72df79

Please sign in to comment.