Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix accelerators not being considered when counting allocatables #2115

Merged
merged 1 commit into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion hack/generate-gpu-count-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# Generates gpu table for `pkg/cloud/volume_limits.go` from the AWS API
# Ensure you are opted into all opt-in regions before running
# Ensure your account isn't in any private instance type betas before running
# We are exluding the g5.48xlarge instance type as it is a special case that does not comply to regular ebs volume limit calculations

set -euo pipefail

Expand All @@ -34,4 +35,4 @@ function get_all_gpus() {
done
}

get_all_gpus | sort | uniq
get_all_gpus | sort | uniq | grep -v "g5.48xlarge"
3 changes: 2 additions & 1 deletion hack/generate-instance-store-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# Generates instance store table for `pkg/cloud/volume_limits.go` from the AWS API
# Ensure you are opted into all opt-in regions before running
# Ensure your account isn't in any private instance type betas before running
# We are exluding the g5.48xlarge instance type as it is a special case that does not comply to regular ebs volume limit calculations

set -euo pipefail

Expand All @@ -36,4 +37,4 @@ function get_all_instance_stores() {
done
}

get_all_instance_stores | sort | uniq
get_all_instance_stores | sort | uniq | grep -v "g5.48xlarge"
43 changes: 33 additions & 10 deletions pkg/cloud/volume_limits.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
"strings"
)

// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
const (
highMemoryMetalInstancesMaxVolumes = 19
highMemoryVirtualInstancesMaxVolumes = 27
Expand Down Expand Up @@ -51,7 +51,7 @@ func init() {

var dedicatedVolumeLimits = map[string]int{}

// / List of nitro instance types can be found here: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
// List of nitro instance types can be found here: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
var nonNitroInstanceFamilies = map[string]struct{}{
"t2": {},
"c3": {},
Expand All @@ -67,6 +67,7 @@ var nonNitroInstanceFamilies = map[string]struct{}{
"g3": {},
"d2": {},
"h1": {},
"f1": {},
AndrewSirenko marked this conversation as resolved.
Show resolved Hide resolved
}

func IsNitroInstanceType(it string) bool {
Expand All @@ -88,8 +89,8 @@ func GetMaxAttachments(nitro bool) int {
return nonNitroMaxAttachments
}

// / Some instance types have a maximum limit of EBS volumes
// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html
// Some instance types have a maximum limit of EBS volumes
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html
var maxVolumeLimits = map[string]int{
"d3.8xlarge": 3,
"d3en.12xlarge": 3,
Expand Down Expand Up @@ -149,12 +150,16 @@ func GetReservedSlotsForInstanceType(it string) int {
if ok {
total += gpus
}
acceleratorSlots, ok := acceleratorSlotsTaken[it]
if ok {
total += acceleratorSlots
}
return total
}

// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html
// / IMDS does not provide NVMe instance store data; we'll just list all instances here
// / TODO: See if we can get these values from DescribeInstanceTypes API
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html
// IMDS does not provide NVMe instance store data; we'll just list all instances here
// g5.48xlarge is not added to this table as it is in the maxVolumeLimits
var nvmeInstanceStoreVolumes = map[string]int{
"c1.medium": 1,
"c1.xlarge": 4,
Expand Down Expand Up @@ -242,7 +247,6 @@ var nvmeInstanceStoreVolumes = map[string]int{
"g5.16xlarge": 1,
"g5.24xlarge": 1,
"g5.2xlarge": 1,
"g5.48xlarge": 2,
"g5.4xlarge": 1,
"g5.8xlarge": 1,
"g5.xlarge": 1,
Expand Down Expand Up @@ -497,7 +501,9 @@ var nvmeInstanceStoreVolumes = map[string]int{
"z1d.xlarge": 1,
}

// / https://aws.amazon.com/ec2/instance-types
// https://aws.amazon.com/ec2/instance-types
// Despite the dl1.24xlarge having Gaudi Accelerators describe instance types considers them GPUs as such that instacne type is in this table
// g5.48xlarge is not added to this table as it is in the maxVolumeLimits
var gpuInstanceGpus = map[string]int{
"dl1.24xlarge": 8,
"g3.16xlarge": 4,
Expand All @@ -520,7 +526,6 @@ var gpuInstanceGpus = map[string]int{
"g5.16xlarge": 1,
"g5.24xlarge": 4,
"g5.2xlarge": 1,
"g5.48xlarge": 8,
"g5.4xlarge": 1,
"g5.8xlarge": 1,
"g5g.16xlarge": 2,
Expand Down Expand Up @@ -551,3 +556,21 @@ var gpuInstanceGpus = map[string]int{
"p4de.24xlarge": 8,
"p5.48xlarge": 8,
}

// Note this table is not a reflection of how many accelerators an instance has but of how many slots their combined acclerators take up
// VT instance type accelerators take two slots each with the exception of the vt1.24xlarge which takes 0 slots for its accelerators
// inf1 instance types are purposely not added to this table as they are in the maxVolumeLimits table
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html
var acceleratorSlotsTaken = map[string]int{
ElijahQuinones marked this conversation as resolved.
Show resolved Hide resolved
"vt1.3xlarge": 2,
"vt1.6xlarge": 4,
"vt1.24xlarge": 0,
"dl2q.24xlarge": 8,
"inf2.xlarge": 1,
"inf2.8xlarge": 1,
"inf2.24xlarge": 6,
"inf2.48xlarge": 12,
"trn1.2xlarge": 1,
"trn1.32xlarge": 16,
"trn1n.32xlarge": 16,
}
12 changes: 8 additions & 4 deletions pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -781,8 +781,8 @@ func (d *NodeService) getVolumesLimit() int64 {
}

dedicatedLimit := cloud.GetDedicatedLimitForInstanceType(instanceType)
maxEBSAttachments, ok := cloud.GetEBSLimitForInstanceType(instanceType)
if ok {
maxEBSAttachments, hasMaxVolumeLimit := cloud.GetEBSLimitForInstanceType(instanceType)
if hasMaxVolumeLimit {
availableAttachments = min(maxEBSAttachments, availableAttachments)
}
// For special dedicated limit instance types, the limit is only for EBS volumes
Expand All @@ -791,8 +791,12 @@ func (d *NodeService) getVolumesLimit() int64 {
availableAttachments = dedicatedLimit
} else if isNitro {
enis := d.metadata.GetNumAttachedENIs()
nvmeInstanceStoreVolumes := cloud.GetReservedSlotsForInstanceType(instanceType)
availableAttachments = availableAttachments - enis - nvmeInstanceStoreVolumes
reservedSlots := cloud.GetReservedSlotsForInstanceType(instanceType)
if hasMaxVolumeLimit {
availableAttachments = availableAttachments - (enis - 1) - reservedSlots
} else {
availableAttachments = availableAttachments - enis - reservedSlots
}
}
availableAttachments = availableAttachments - reservedVolumeAttachments
if availableAttachments <= 0 {
Expand Down
121 changes: 102 additions & 19 deletions pkg/driver/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1136,23 +1136,87 @@ func TestGetVolumesLimit(t *testing.T) {
},
},
{
name: "inf1.24xlarge_volume_attach_limit",
name: "mac1.metal_volume_attach_limit",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 9,
expectedVal: 15,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("inf1.24xlarge")
m.EXPECT().GetInstanceType().Return("mac1.metal")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "u-12tb1.metal_volume_attach_limit",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 18,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("u-12tb1.metal")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "mac1.metal_volume_attach_limit",
name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4ad.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 21,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.12xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "dl1.24xlarge_volume_attach_limit (8 Accelerator slots , 4 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
Expand All @@ -1161,73 +1225,92 @@ func TestGetVolumesLimit(t *testing.T) {
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("mac1.metal")
m.EXPECT().GetInstanceType().Return("dl1.24xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "u-12tb1.metal_volume_attach_limit",
// will and should fail if g5.48xlarge instance type is in any table other than maxVolumeLimits table
ElijahQuinones marked this conversation as resolved.
Show resolved Hide resolved
name: "g5.48xlarge_volume_attach_limit (Instance has attached GPUs and NVMe Instance Store volumes but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 17,
expectedVal: 8,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("u-12tb1.metal")
m.EXPECT().GetInstanceType().Return("g5.48xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
// Should fail if inf1.xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
expectedVal: 25,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.xlarge")
m.EXPECT().GetInstanceType().Return("inf1.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
// 1 gpu
{
name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
// Should fail if inf1.xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.2xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
expectedVal: 25,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4ad.xlarge")
m.EXPECT().GetInstanceType().Return("inf1.2xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
// 4 gpus
{
name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)",
// Should fail if inf1.6xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.6xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 21,
expectedVal: 22,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.12xlarge")
m.EXPECT().GetInstanceType().Return("inf1.6xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
// Should fail if inf1.24xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.24xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 10,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("inf1.24xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
Expand Down
Loading