Skip to content

Commit

Permalink
Merge pull request #314 from elezar/CNT-4032/mulitple-naming-strategies
Browse files Browse the repository at this point in the history
Allow multiple naming strategies when generating CDI specification
  • Loading branch information
elezar authored Feb 13, 2024
2 parents a8d4880 + b6efd30 commit 355997d
Show file tree
Hide file tree
Showing 14 changed files with 128 additions and 64 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

* Add a `--spec-dir` option to the `nvidia-ctk cdi generate` command. This allows specs outside of `/etc/cdi` and `/var/run/cdi` to be processed.
* Add support for extracting device major number from `/proc/devices` if `nvidia` is used as a device name over `nvidia-frontend`.
* Allow multiple device naming strategies for the `nvidia-ctk cdi generate` command. This allows a single
CDI spec to be generated that includes GPUs by both index and UUID.
* Set the default `--device-name-strategy` for the `nvidia-ctk cdi generate` command to `[index, uuid]`.

## v1.15.0-rc.3
* Fix bug in `nvidia-ctk hook update-ldcache` where default `--ldconfig-path` value was not applied.
Expand Down
48 changes: 27 additions & 21 deletions cmd/nvidia-ctk/cdi/generate/generate.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,16 @@ type command struct {
}

type options struct {
output string
format string
deviceNameStrategy string
driverRoot string
devRoot string
nvidiaCTKPath string
ldconfigPath string
mode string
vendor string
class string
output string
format string
deviceNameStrategies cli.StringSlice
driverRoot string
devRoot string
nvidiaCTKPath string
ldconfigPath string
mode string
vendor string
class string

librarySearchPaths cli.StringSlice

Expand Down Expand Up @@ -109,11 +109,11 @@ func (m command) build() *cli.Command {
Usage: "Specify the root where `/dev` is located. If this is not specified, the driver-root is assumed.",
Destination: &opts.devRoot,
},
&cli.StringFlag{
&cli.StringSliceFlag{
Name: "device-name-strategy",
Usage: "Specify the strategy for generating device names. One of [index | uuid | type-index]",
Value: nvcdi.DeviceNameStrategyIndex,
Destination: &opts.deviceNameStrategy,
Usage: "Specify the strategy for generating device names. If this is specified multiple times, the devices will be duplicated for each strategy. One of [index | uuid | type-index]",
Value: cli.NewStringSlice(nvcdi.DeviceNameStrategyIndex, nvcdi.DeviceNameStrategyUUID),
Destination: &opts.deviceNameStrategies,
},
&cli.StringFlag{
Name: "driver-root",
Expand Down Expand Up @@ -185,9 +185,11 @@ func (m command) validateFlags(c *cli.Context, opts *options) error {
return fmt.Errorf("invalid discovery mode: %v", opts.mode)
}

_, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy)
if err != nil {
return err
for _, strategy := range opts.deviceNameStrategies.Value() {
_, err := nvcdi.NewDeviceNamer(strategy)
if err != nil {
return err
}
}

opts.nvidiaCTKPath = config.ResolveNVIDIACTKPath(m.logger, opts.nvidiaCTKPath)
Expand Down Expand Up @@ -241,9 +243,13 @@ func formatFromFilename(filename string) string {
}

func (m command) generateSpec(opts *options) (spec.Interface, error) {
deviceNamer, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy)
if err != nil {
return nil, fmt.Errorf("failed to create device namer: %v", err)
var deviceNamers []nvcdi.DeviceNamer
for _, strategy := range opts.deviceNameStrategies.Value() {
deviceNamer, err := nvcdi.NewDeviceNamer(strategy)
if err != nil {
return nil, fmt.Errorf("failed to create device namer: %v", err)
}
deviceNamers = append(deviceNamers, deviceNamer)
}

cdilib, err := nvcdi.New(
Expand All @@ -252,7 +258,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
nvcdi.WithDevRoot(opts.devRoot),
nvcdi.WithNVIDIACTKPath(opts.nvidiaCTKPath),
nvcdi.WithLdconfigPath(opts.ldconfigPath),
nvcdi.WithDeviceNamer(deviceNamer),
nvcdi.WithDeviceNamers(deviceNamers...),
nvcdi.WithMode(opts.mode),
nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()),
nvcdi.WithCSVFiles(opts.csv.files.Value()),
Expand Down
4 changes: 2 additions & 2 deletions pkg/nvcdi/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ type Interface interface {
GetCommonEdits() (*cdi.ContainerEdits, error)
GetAllDeviceSpecs() ([]specs.Device, error)
GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error)
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error)
GetDeviceSpecsByID(...string) ([]specs.Device, error)
}
17 changes: 10 additions & 7 deletions pkg/nvcdi/full-gpu-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,26 @@ import (
)

// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'd', one spec per generated device name.
func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
edits, err := l.GetGPUDeviceEdits(d)
if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err)
}

name, err := l.deviceNamer.GetDeviceName(i, convert{d})
var deviceSpecs []specs.Device
names, err := l.deviceNamers.GetDeviceNames(i, convert{d})
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}

spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
for _, name := range names {
spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, spec)
}

return &spec, nil
return deviceSpecs, nil
}

// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
Expand Down
4 changes: 2 additions & 2 deletions pkg/nvcdi/gds.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func (l *gdslib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
}

// GetGPUDeviceSpecs is unsupported for the gdslib specs
func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
}

Expand All @@ -78,7 +78,7 @@ func (l *gdslib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
}

// GetMIGDeviceSpecs is unsupported for the gdslib specs
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
}

Expand Down
20 changes: 12 additions & 8 deletions pkg/nvcdi/lib-csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,20 @@ func (l *csvlib) GetAllDeviceSpecs() ([]specs.Device, error) {
return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err)
}

name, err := l.deviceNamer.GetDeviceName(0, uuidUnsupported{})
names, err := l.deviceNamers.GetDeviceNames(0, uuidIgnored{})
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}

deviceSpec := specs.Device{
Name: name,
ContainerEdits: *e.ContainerEdits,
var deviceSpecs []specs.Device
for _, name := range names {
deviceSpec := specs.Device{
Name: name,
ContainerEdits: *e.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, deviceSpec)
}
return []specs.Device{deviceSpec}, nil

return deviceSpecs, nil
}

// GetCommonEdits generates a CDI specification that can be used for ANY devices
Expand All @@ -82,7 +86,7 @@ func (l *csvlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
}

// GetGPUDeviceSpecs is unsupported for CSV files and always returns an error.
func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported for CSV files")
}

Expand All @@ -92,7 +96,7 @@ func (l *csvlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
}

// GetMIGDeviceSpecs is unsupported for CSV files and always returns an error.
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files")
}

Expand Down
8 changes: 4 additions & 4 deletions pkg/nvcdi/lib-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,11 @@ func (l *nvmllib) getEditsForMIGDevice(nvmlDevice nvml.Device) (*cdi.ContainerEd
func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
deviceSpec, err := l.GetGPUDeviceSpecs(i, d)
specsForDevice, err := l.GetGPUDeviceSpecs(i, d)
if err != nil {
return err
}
deviceSpecs = append(deviceSpecs, *deviceSpec)
deviceSpecs = append(deviceSpecs, specsForDevice...)

return nil
})
Expand All @@ -225,11 +225,11 @@ func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
func (l *nvmllib) getMigDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error {
deviceSpec, err := l.GetMIGDeviceSpecs(i, d, j, mig)
specsForDevice, err := l.GetMIGDeviceSpecs(i, d, j, mig)
if err != nil {
return err
}
deviceSpecs = append(deviceSpecs, *deviceSpec)
deviceSpecs = append(deviceSpecs, specsForDevice...)

return nil
})
Expand Down
4 changes: 2 additions & 2 deletions pkg/nvcdi/lib-wsl.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func (l *wsllib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
}

// GetGPUDeviceSpecs is unsupported on WSL and always returns an error.
func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported on WSL")
}

Expand All @@ -78,7 +78,7 @@ func (l *wsllib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
}

// GetMIGDeviceSpecs is unsupported on WSL and always returns an error.
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL")
}

Expand Down
7 changes: 4 additions & 3 deletions pkg/nvcdi/lib.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ type nvcdilib struct {
nvmllib nvml.Interface
mode string
devicelib device.Interface
deviceNamer DeviceNamer
deviceNamers DeviceNamers
driverRoot string
devRoot string
nvidiaCTKPath string
Expand Down Expand Up @@ -75,8 +75,9 @@ func New(opts ...Option) (Interface, error) {
if l.logger == nil {
l.logger = logger.New()
}
if l.deviceNamer == nil {
l.deviceNamer, _ = NewDeviceNamer(DeviceNameStrategyIndex)
if len(l.deviceNamers) == 0 {
indexNamer, _ := NewDeviceNamer(DeviceNameStrategyIndex)
l.deviceNamers = []DeviceNamer{indexNamer}
}
if l.driverRoot == "" {
l.driverRoot = "/"
Expand Down
4 changes: 2 additions & 2 deletions pkg/nvcdi/management.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, e
}

// GetGPUDeviceSpecs is unsupported for the managementlib specs
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
}

Expand All @@ -185,7 +185,7 @@ func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi
}

// GetMIGDeviceSpecs is unsupported for the managementlib specs
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
}

Expand Down
18 changes: 10 additions & 8 deletions pkg/nvcdi/mig-device-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,25 @@ import (
)

// GetMIGDeviceSpecs returns the CDI device specs for the MIG device represented by 'mig' on the parent GPU 'd'.
func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) (*specs.Device, error) {
func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) ([]specs.Device, error) {
edits, err := l.GetMIGDeviceEdits(d, mig)
if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err)
}

name, err := l.deviceNamer.GetMigDeviceName(i, convert{d}, j, convert{mig})
names, err := l.deviceNamers.GetMigDeviceNames(i, convert{d}, j, convert{mig})
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}

spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
var deviceSpecs []specs.Device
for _, name := range names {
spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, spec)
}

return &spec, nil
return deviceSpecs, nil
}

// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.
Expand Down
4 changes: 2 additions & 2 deletions pkg/nvcdi/mofed.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func (l *mofedlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
}

// GetGPUDeviceSpecs is unsupported for the mofedlib specs
func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
}

Expand All @@ -78,7 +78,7 @@ func (l *mofedlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Cont
}

// GetMIGDeviceSpecs is unsupported for the mofedlib specs
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
}

Expand Down
Loading

0 comments on commit 355997d

Please sign in to comment.