Skip to content

Commit

Permalink
Merge pull request #289 from etiennedub/sharding_doc
Browse files Browse the repository at this point in the history
Sharding GPU support
  • Loading branch information
cmd-ntrf authored Nov 4, 2024
2 parents aa62d4b + c56851f commit 3b5212a
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 16 deletions.
9 changes: 5 additions & 4 deletions aws/infrastructure.tf
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,11 @@ locals {
prefix = values.prefix
tags = values.tags
specs = {
cpus = data.aws_ec2_instance_type.instance_type[values.prefix].default_vcpus
ram = data.aws_ec2_instance_type.instance_type[values.prefix].memory_size
gpus = try(one(data.aws_ec2_instance_type.instance_type[values.prefix].gpus).count, 0)
mig = lookup(values, "mig", null)
cpus = data.aws_ec2_instance_type.instance_type[values.prefix].default_vcpus
ram = data.aws_ec2_instance_type.instance_type[values.prefix].memory_size
gpus = try(one(data.aws_ec2_instance_type.instance_type[values.prefix].gpus).count, 0)
mig = lookup(values, "mig", null)
shard = lookup(values, "shard", null)
}
volumes = contains(keys(module.design.volume_per_instance), x) ? {
for pv_key, pv_values in var.volumes:
Expand Down
9 changes: 5 additions & 4 deletions azure/infrastructure.tf
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,11 @@ locals {
prefix = values.prefix
tags = values.tags
specs = {
cpus = local.vmsizes[values.type].vcpus
ram = local.vmsizes[values.type].ram
gpus = local.vmsizes[values.type].gpus
mig = lookup(values, "mig", null)
cpus = local.vmsizes[values.type].vcpus
ram = local.vmsizes[values.type].ram
gpus = local.vmsizes[values.type].gpus
mig = lookup(values, "mig", null)
shard = lookup(values, "shard", null)
}
volumes = contains(keys(module.design.volume_per_instance), x) ? {
for pv_key, pv_values in var.volumes:
Expand Down
1 change: 1 addition & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,7 @@ the operating system and service software
```
This is only functional with [MIG supported GPUs](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#supported-gpus),
and with x86-64 processors (see [NVIDIA/mig-parted issue #30](https://github.com/NVIDIA/mig-parted/issues/30)).
6. `shard`: total number of [shards](https://slurm.schedmd.com/gres.html#Sharding) on the node. Sharding allows sharing the same GPU across multiple jobs. The total number of shards is evenly distributed across all GPUs on the node.
For some cloud providers, it is possible to define additional attributes.
The following sections present the available attributes per provider.
Expand Down
9 changes: 5 additions & 4 deletions gcp/infrastructure.tf
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,11 @@ locals {
prefix = values.prefix
tags = values.tags
specs = {
cpus = data.external.machine_type[values["prefix"]].result["vcpus"]
ram = data.external.machine_type[values["prefix"]].result["ram"]
gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
mig = lookup(values, "mig", null)
cpus = data.external.machine_type[values["prefix"]].result["vcpus"]
ram = data.external.machine_type[values["prefix"]].result["ram"]
gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
mig = lookup(values, "mig", null)
shard = lookup(values, "shard", null)
}
volumes = contains(keys(module.design.volume_per_instance), x) ? {
for pv_key, pv_values in var.volumes:
Expand Down
9 changes: 5 additions & 4 deletions openstack/infrastructure.tf
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,14 @@ locals {
prefix = values.prefix
tags = values.tags
specs = {
cpus = data.openstack_compute_flavor_v2.flavors[values.prefix].vcpus
ram = data.openstack_compute_flavor_v2.flavors[values.prefix].ram
gpus = sum([
cpus = data.openstack_compute_flavor_v2.flavors[values.prefix].vcpus
ram = data.openstack_compute_flavor_v2.flavors[values.prefix].ram
gpus = sum([
parseint(lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "resources:VGPU", "0"), 10),
parseint(split(":", lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "pci_passthrough:alias", "gpu:0"))[1], 10)
])
mig = lookup(values, "mig", null)
mig = lookup(values, "mig", null)
shard = lookup(values, "shard", null)
}
volumes = contains(keys(module.design.volume_per_instance), x) ? {
for pv_key, pv_values in var.volumes:
Expand Down

0 comments on commit 3b5212a

Please sign in to comment.