Skip to content

Commit

Permalink
Merge pull request NVIDIA#242 from klueska/add-imex-support
Browse files Browse the repository at this point in the history
Add imex support
  • Loading branch information
klueska authored and elezar committed Feb 27, 2024
1 parent 7abf2e0 commit 0ab7e75
Show file tree
Hide file tree
Showing 19 changed files with 226 additions and 20 deletions.
13 changes: 8 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@ libnvidia-container.so*
deps
src/build.h

src/driver_clt.c
src/driver_rpc.h
src/driver_svc.c
src/driver_xdr.c
src/nvc.h
src/nvc_clt.c
src/nvc_rpc.h
src/nvc_svc.c
src/nvc_xdr.c
src/nvcgo/libnvidia-container-go.h

/dist
/.vscode
/.vscode
2 changes: 1 addition & 1 deletion mk/nvidia-modprobe.mk
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include $(MAKE_DIR)/common.mk

##### Source definitions #####

VERSION := 495.44
VERSION := 550.54.14
PREFIX := nvidia-modprobe-$(VERSION)
URL := https://github.com/NVIDIA/nvidia-modprobe/archive/$(VERSION).tar.gz

Expand Down
6 changes: 4 additions & 2 deletions mk/nvidia-modprobe.patch
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
diff -ruN nvidia-modprobe-495.44/modprobe-utils/nvidia-modprobe-utils.c nvidia-modprobe-495.44-patched/modprobe-utils/nvidia-modprobe-utils.c
--- nvidia-modprobe-495.44/modprobe-utils/nvidia-modprobe-utils.c 2021-11-13 14:36:58.096684602 +0000
+++ nvidia-modprobe-495.44-patched/modprobe-utils/nvidia-modprobe-utils.c 2021-11-13 14:43:40.965146390 +0000
@@ -888,10 +888,10 @@
@@ -959,10 +959,10 @@
return mknod_helper(major, minor_num, vgpu_dev_name, NV_PROC_REGISTRY_PATH);
}

Expand All @@ -19,11 +19,13 @@ diff -ruN nvidia-modprobe-495.44/modprobe-utils/nvidia-modprobe-utils.c nvidia-m
diff -ruN nvidia-modprobe-495.44/modprobe-utils/nvidia-modprobe-utils.h nvidia-modprobe-495.44-patched/modprobe-utils/nvidia-modprobe-utils.h
--- nvidia-modprobe-495.44/modprobe-utils/nvidia-modprobe-utils.h 2021-11-13 14:36:58.096684602 +0000
+++ nvidia-modprobe-495.44-patched/modprobe-utils/nvidia-modprobe-utils.h 2021-11-13 14:38:34.078700961 +0000
@@ -81,6 +81,7 @@
@@ -87,6 +87,7 @@
int nvidia_nvswitch_get_file_state(int minor);
int nvidia_cap_mknod(const char* cap_file_path, int *minor);
int nvidia_cap_get_file_state(const char* cap_file_path);
+int nvidia_cap_get_device_file_attrs(const char* cap_file_path, int *major, int *minor, char *name);
int nvidia_cap_imex_channel_mknod(int minor);
int nvidia_cap_imex_channel_file_state(int minor);
int nvidia_get_chardev_major(const char *name);
int nvidia_msr_modprobe(void);

1 change: 1 addition & 0 deletions pkg/deb/libnvidia-container@[email protected]
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ libnvidia-container.so.1 libnvidia-container1 #MINVER#
nvc_mig_config_global_caps_mount@NVC_1.0 @VERSION_TAG@
nvc_mig_monitor_global_caps_mount@NVC_1.0 @VERSION_TAG@
nvc_device_mig_caps_mount@NVC_1.0 @VERSION_TAG@
nvc_imex_channel_mount@NVC_1.0 @VERSION_TAG@
nvc_driver_info_free@NVC_1.0 @VERSION_TAG@
nvc_driver_info_new@NVC_1.0 @VERSION_TAG@
nvc_driver_mount@NVC_1.0 @VERSION_TAG@
Expand Down
6 changes: 6 additions & 0 deletions src/cli/cli.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ struct context {
char *devices;
char *mig_config;
char *mig_monitor;
char *imex_channels;
};

bool matches_pci_format(const char *gpu, char *buf, size_t bufsize);
Expand All @@ -74,6 +75,11 @@ void free_devices(struct devices *d);
int print_nvcaps_device_from_proc_file(struct nvc_context *, const char*, const char*);
int print_all_mig_minor_devices(const struct nvc_device_node *);

/*
 * Parse the comma-separated IMEX channel ID list 'chans' into 'imex'.
 * 'chans' is not const: the implementation tokenizes it in place.
 * Returns 0 on success and -1 (with 'err' describing the problem) on failure.
 */
int parse_imex_info(
struct error *err,
char *chans,
struct nvc_imex_info *imex);

int select_devices(
struct error *err,
char *devs,
Expand Down
53 changes: 53 additions & 0 deletions src/cli/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,59 @@ select_mig_monitor_devices(
return (-1);
}

/*
 * Parse a comma-separated list of IMEX channel IDs into 'imex'.
 *
 * 'imex' is always cleared first. A NULL 'chans', or one for which
 * str_count_tokens() reports no tokens, leaves 'imex' empty and returns 0.
 *
 * 'chans' is tokenized in place with strsep(), so the buffer is modified.
 * Empty tokens (extra commas) are skipped. Every other token must be a plain
 * decimal number (no sign, no leading whitespace) smaller than 2^20, so that
 * the ID fits in the minor number of a dev_t.
 *
 * Returns 0 on success. Returns -1 if the channel array cannot be allocated
 * or if a token is not a valid channel ID; in the latter case 'err' carries
 * the message and 'imex->chans' is still allocated, so the caller remains
 * responsible for freeing it.
 */
int
parse_imex_info(
    struct error *err,
    char *chans,
    struct nvc_imex_info *imex)
{
        const char sep[] = ",";
        struct error ierr = {0};
        char *chan = NULL;
        char *ptr;
        uintmax_t id; /* full width of strtoumax() so nothing is truncated before the range check */
        size_t max_chans;

        /* Clear the imex struct. */
        memset(imex, 0, sizeof(*imex));

        /* No channel list at all: nothing to parse. */
        if (chans == NULL)
                return (0);

        /* Short circuit if there are no tokens. */
        max_chans = str_count_tokens(chans, ',');
        if (max_chans == 0)
                return (0);

        /* Allocate space for all IMEX channels. */
        if ((imex->chans = xcalloc(err, max_chans, sizeof(*imex->chans))) == NULL)
                return (-1);

        /* Walk the comma-separated string and populate 'imex->chans' from it. */
        while ((chan = strsep(&chans, sep)) != NULL) {
                /* Allow extra commas between channel IDs. */
                if (*chan == '\0')
                        continue;

                /*
                 * strtoumax() skips leading whitespace and accepts a sign; a
                 * negative input is negated modulo 2^N and could wrap into the
                 * valid range. Insist on a leading digit to rule that out.
                 */
                if (*chan < '0' || *chan > '9')
                        goto fail;

                /* Channel IDs must fit in the minor number of a dev_t (so within 20 bits). */
                id = strtoumax(chan, &ptr, 10);
                if (*ptr != '\0' || id >= ((uintmax_t)1 << 20))
                        goto fail;

                /* Add the IMEX channel to the selected list. */
                imex->chans[imex->nchans].id = (int)id;
                imex->nchans++;
        }

        return (0);

 fail:
        /* 'chan' still points at the offending token here. */
        error_setx(&ierr, "unsupported IMEX channel value: %s", chan);
        error_setx(err, "%s", ierr.msg);
        error_reset(&ierr);
        return (-1);
}

int
new_devices(struct error *err, const struct nvc_device_info *dev, struct devices *d)
{
Expand Down
26 changes: 21 additions & 5 deletions src/cli/configure.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ const struct argp configure_usage = {
{"compat32", 0x80, NULL, 0, "Enable 32bits compatibility", -1},
{"mig-config", 0x81, "ID", 0, "Enable configuration of MIG devices", -1},
{"mig-monitor", 0x82, "ID", 0, "Enable monitoring of MIG devices", -1},
{"no-cgroups", 0x83, NULL, 0, "Don't use cgroup enforcement", -1},
{"no-devbind", 0x84, NULL, 0, "Don't bind mount devices", -1},
{"imex-channel", 0x83, "CHANNEL", 0, "IMEX channel ID(s) to inject", -1},
{"no-cgroups", 0x84, NULL, 0, "Don't use cgroup enforcement", -1},
{"no-devbind", 0x85, NULL, 0, "Don't bind mount devices", -1},
{0},
},
configure_parser,
Expand Down Expand Up @@ -138,10 +139,14 @@ configure_parser(int key, char *arg, struct argp_state *state)
goto fatal;
break;
case 0x83:
if (str_join(&err, &ctx->container_flags, "no-cgroups", " ") < 0)
if (str_join(&err, &ctx->imex_channels, arg, ",") < 0)
goto fatal;
break;
case 0x84:
if (str_join(&err, &ctx->container_flags, "no-cgroups", " ") < 0)
goto fatal;
break;
case 0x85:
if (str_join(&err, &ctx->container_flags, "no-devbind", " ") < 0)
goto fatal;
break;
Expand Down Expand Up @@ -262,6 +267,10 @@ configure_command(const struct context *ctx)
nvc_cfg->gid = ctx->gid;
nvc_cfg->root = ctx->root;
nvc_cfg->ldcache = ctx->ldcache;
if (parse_imex_info(&err, ctx->imex_channels, &nvc_cfg->imex) < 0) {
warnx("error parsing IMEX info: %s", err.msg);
goto fail;
}
if (libnvc.init(nvc, nvc_cfg, ctx->init_flags) < 0) {
warnx("initialization error: %s", libnvc.error(nvc));
goto fail;
Expand Down Expand Up @@ -319,7 +328,7 @@ configure_command(const struct context *ctx)
goto fail;
}

/* Select the devices available for MIG monitor among the visible . */
/* Select the devices available for MIG monitor among the visible devices. */
if (select_mig_monitor_devices(&err, ctx->mig_monitor, &devices, &mig_monitor_devices) < 0) {
warnx("mig-monitor error: %s", err.msg);
goto fail;
Expand Down Expand Up @@ -359,7 +368,7 @@ configure_command(const struct context *ctx)
}
}

/* Mount the driver, visible devices, mig-configs and mig-monitors. */
/* Mount the driver, visible devices, mig-configs, mig-monitors, and imex-channels. */
if (perm_set_capabilities(&err, CAP_EFFECTIVE, ecaps[NVC_MOUNT], ecaps_size(NVC_MOUNT)) < 0) {
warnx("permission error: %s", err.msg);
goto fail;
Expand Down Expand Up @@ -406,6 +415,12 @@ configure_command(const struct context *ctx)
}
}
}
for (size_t i = 0; i < nvc_cfg->imex.nchans; ++i) {
if (libnvc.imex_channel_mount(nvc, cnt, &nvc_cfg->imex.chans[i]) < 0) {
warnx("mount error: %s", libnvc.error(nvc));
goto fail;
}
}

/* Update the container ldcache. */
if (perm_set_capabilities(&err, CAP_EFFECTIVE, ecaps[NVC_LDCACHE], ecaps_size(NVC_LDCACHE)) < 0) {
Expand All @@ -424,6 +439,7 @@ configure_command(const struct context *ctx)
rv = EXIT_SUCCESS;

fail:
free(nvc_cfg->imex.chans);
free_devices(&devices);
libnvc.shutdown(nvc);
libnvc.container_free(cnt);
Expand Down
1 change: 1 addition & 0 deletions src/cli/libnvc.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ load_libnvc_v1(void)
load_libnvc_func(mig_config_global_caps_mount);
load_libnvc_func(mig_monitor_global_caps_mount);
load_libnvc_func(device_mig_caps_mount);
load_libnvc_func(imex_channel_mount);

return (0);
}
Expand Down
1 change: 1 addition & 0 deletions src/cli/libnvc.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct libnvc {
libnvc_entry(mig_config_global_caps_mount);
libnvc_entry(mig_monitor_global_caps_mount);
libnvc_entry(device_mig_caps_mount);
libnvc_entry(imex_channel_mount);
};

int load_libnvc(void);
Expand Down
19 changes: 18 additions & 1 deletion src/cli/list.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const struct argp list_usage = {
{"compat32", 0x80, NULL, 0, "Enable 32bits compatibility", -1},
{"mig-config", 0x81, "ID", 0, "MIG devices to list config capabilities files for", -1},
{"mig-monitor", 0x82, "ID", 0, "MIG devices to list monitor capabilities files for", -1},
{"imex-channel", 0x83, "CHANNEL", 0, "IMEX channel ID(s) to inject", -1},
{0},
},
list_parser,
Expand Down Expand Up @@ -65,8 +66,12 @@ list_parser(int key, char *arg, struct argp_state *state)
if (str_join(&err, &ctx->mig_monitor, arg, ",") < 0)
goto fatal;
break;
case 0x83:
if (str_join(&err, &ctx->imex_channels, arg, ",") < 0)
goto fatal;
break;
case ARGP_KEY_END:
if (state->argc == 1) {
if (state->argc == 1 || (state->argc == 2 && ctx->imex_channels != NULL)) {
if ((ctx->devices = xstrdup(&err, "all")) == NULL)
goto fatal;
ctx->mig_config = NULL;
Expand Down Expand Up @@ -127,6 +132,10 @@ list_command(const struct context *ctx)
nvc_cfg->gid = (!run_as_root && ctx->gid == (gid_t)-1) ? getegid() : ctx->gid;
nvc_cfg->root = ctx->root;
nvc_cfg->ldcache = ctx->ldcache;
if (parse_imex_info(&err, ctx->imex_channels, &nvc_cfg->imex) < 0) {
warnx("error parsing IMEX info: %s", err.msg);
goto fail;
}
if (libnvc.init(nvc, nvc_cfg, ctx->init_flags) < 0) {
warnx("initialization error: %s", libnvc.error(nvc));
goto fail;
Expand Down Expand Up @@ -203,6 +212,13 @@ list_command(const struct context *ctx)
}
}

/* List the IMEX channel devices. */
if (ctx->imex_channels != NULL) {
for (size_t i = 0; i < nvc_cfg->imex.nchans; ++i) {
printf(NV_CAPS_IMEX_DEVICE_PATH"\n", nvc_cfg->imex.chans[i].id);
}
}

/* List the files required for MIG configuration of the visible devices */
if (mig_config_devices.all && mig_config_devices.ngpus) {
printf("%s/%s\n", NV_MIG_CAPS_PATH, NV_MIG_CONFIG_FILE);
Expand Down Expand Up @@ -258,6 +274,7 @@ list_command(const struct context *ctx)
}
rv = EXIT_SUCCESS;
fail:
free(nvc_cfg->imex.chans);
free_devices(&devices);
libnvc.shutdown(nvc);
libnvc.device_info_free(dev);
Expand Down
3 changes: 3 additions & 0 deletions src/cli/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,5 +146,8 @@ main(int argc, char *argv[])
free(ctx.devices);
free(ctx.init_flags);
free(ctx.container_flags);
free(ctx.mig_config);
free(ctx.mig_monitor);
free(ctx.imex_channels);
return (rv);
}
1 change: 1 addition & 0 deletions src/libnvidia-container.lds
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ VERSION {
nvc_mig_config_global_caps_mount;
nvc_mig_monitor_global_caps_mount;
nvc_device_mig_caps_mount;
nvc_imex_channel_mount;

__ubsan_default_options;
local:
Expand Down
27 changes: 23 additions & 4 deletions src/nvc.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
#include "xfuncs.h"

static int init_within_userns(struct error *);
static int load_kernel_modules(struct error *, const char *);
static int load_kernel_modules(struct error *, const char *, const struct nvc_imex_info *);
static int copy_config(struct error *, struct nvc_context *, const struct nvc_config *);

const char interpreter[] __attribute__((section(".interp"))) = LIB_DIR "/" LD_SO;
Expand Down Expand Up @@ -229,7 +229,7 @@ mig_nvcaps_mknodes(struct error *err, int num_gpus) {
}

static int
load_kernel_modules(struct error *err, const char *root)
load_kernel_modules(struct error *err, const char *root, const struct nvc_imex_info *imex)
{
int userns;
pid_t pid;
Expand Down Expand Up @@ -290,6 +290,11 @@ load_kernel_modules(struct error *err, const char *root)
log_info("running mknod for all nvcaps in " NV_CAPS_DEVICE_DIR);
if (mig_nvcaps_mknodes(err, devs.num_matches) < 0)
log_errf("could not create kernel module device nodes: %s", err->msg);
for (int i = 0; i < (int)imex->nchans; ++i) {
log_infof("running mknod for " NV_CAPS_IMEX_DEVICE_PATH, imex->chans[i].id);
if (nvidia_cap_imex_channel_mknod(imex->chans[i].id) == 0)
log_errf("could not mknod for IMEX channel %d", imex->chans[i].id);
}
error_reset(err);
}

Expand Down Expand Up @@ -347,9 +352,21 @@ copy_config(struct error *err, struct nvc_context *ctx, const struct nvc_config
ctx->cfg.gid = (gid_t)gid;
}

if (cfg->imex.nchans > 0) {
if ((ctx->cfg.imex.chans = xcalloc(err, cfg->imex.nchans, sizeof(*ctx->cfg.imex.chans))) == NULL)
return (-1);
}
for (size_t i = 0; i < cfg->imex.nchans; ++i) {
ctx->cfg.imex.chans[i] = cfg->imex.chans[i];
}
ctx->cfg.imex.nchans = cfg->imex.nchans;

log_infof("using root %s", ctx->cfg.root);
log_infof("using ldcache %s", ctx->cfg.ldcache);
log_infof("using unprivileged user %"PRIu32":%"PRIu32, (uint32_t)ctx->cfg.uid, (uint32_t)ctx->cfg.gid);
for (size_t i = 0; i < ctx->cfg.imex.nchans; ++i) {
log_infof("using IMEX channel %d", ctx->cfg.imex.chans[i].id);
}
return (0);
}

Expand All @@ -364,7 +381,7 @@ nvc_init(struct nvc_context *ctx, const struct nvc_config *cfg, const char *opts
if (ctx->initialized)
return (0);
if (cfg == NULL)
cfg = &(struct nvc_config){NULL, NULL, (uid_t)-1, (gid_t)-1};
cfg = &(struct nvc_config){NULL, NULL, (uid_t)-1, (gid_t)-1, {0}};
if (validate_args(ctx, !str_empty(cfg->ldcache) && !str_empty(cfg->root)) < 0)
return (-1);
if (opts == NULL)
Expand Down Expand Up @@ -403,7 +420,7 @@ nvc_init(struct nvc_context *ctx, const struct nvc_config *cfg, const char *opts
if (flags & OPT_LOAD_KMODS) {
if (ctx->dxcore.initialized)
log_warn("skipping kernel modules load on WSL");
else if (load_kernel_modules(&ctx->err, ctx->cfg.root) < 0)
else if (load_kernel_modules(&ctx->err, ctx->cfg.root, &ctx->cfg.imex) < 0)
goto fail;
}

Expand All @@ -421,6 +438,7 @@ nvc_init(struct nvc_context *ctx, const struct nvc_config *cfg, const char *opts
fail:
free(ctx->cfg.root);
free(ctx->cfg.ldcache);
free(ctx->cfg.imex.chans);
xclose(ctx->mnt_ns);
return (-1);
}
Expand Down Expand Up @@ -453,6 +471,7 @@ nvc_shutdown(struct nvc_context *ctx)

free(ctx->cfg.root);
free(ctx->cfg.ldcache);
free(ctx->cfg.imex.chans);
xclose(ctx->mnt_ns);

memset(&ctx->cfg, 0, sizeof(ctx->cfg));
Expand Down
Loading

0 comments on commit 0ab7e75

Please sign in to comment.