Skip to content

Commit

Permalink
Merge branch 'main' into feat/ray-cluster-head-scheduling
Browse files Browse the repository at this point in the history
  • Loading branch information
kukushking authored Sep 23, 2024
2 parents fe0b1af + dc04776 commit 541a731
Show file tree
Hide file tree
Showing 10 changed files with 635 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## UNRELEASED

### **Added**
- added new manifest `manifests/fine-tuning-6B`

### **Changed**

Expand Down
26 changes: 26 additions & 0 deletions manifests/fine-tuning-6B/base-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Module "networking" (group: base): basic-cdk network module from idf-modules.
# Its VpcId and PrivateSubnetIds metadata are read by the eks and fsx-lustre
# modules in core-modules.yaml.
name: networking
path: git::https://github.com/awslabs/idf-modules.git//modules/network/basic-cdk?ref=release/1.11.0&depth=1
parameters:
  - name: InternetAccessible
    value: true
---
# Module "buckets" (group: base): storage buckets module from idf-modules.
# Its ArtifactsBucketName metadata is passed as DataBucketName to the
# fsx-lustre, ray-operator and ray-cluster modules.
name: buckets
path: git::https://github.com/awslabs/idf-modules.git//modules/storage/buckets?ref=release/1.11.0&depth=1
parameters:
  - name: EncryptionType
    value: SSE
  - name: RetentionType
    value: DESTROY
---
# Module "ray-ecr" (group: base): ECR repository module from idf-modules.
# Its EcrRepositoryName metadata is read by the "ray" image module in
# images-modules.yaml. `targetAccount` refers to the `primary` alias declared
# under targetAccountMappings in deployment.yaml.
name: ray-ecr
path: git::https://github.com/awslabs/idf-modules.git//modules/storage/ecr?ref=release/1.11.0&depth=1
targetAccount: primary
parameters:
  - name: ImageTagMutability
    value: MUTABLE
  - name: ImageScanOnPush
    value: true
  - name: Encryption
    value: KMS_MANAGED
  - name: RemovalPolicy
    value: DESTROY
112 changes: 112 additions & 0 deletions manifests/fine-tuning-6B/core-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Module "eks" (group: core): EKS cluster module from idf-modules.
# Networking inputs come from the base/networking module; the cluster metadata
# this module exports is consumed by the integration, ray-operator and
# ray-cluster manifests.
name: eks
path: git::https://github.com/awslabs/idf-modules.git//modules/compute/eks?ref=release/1.11.0&depth=1
dataFiles:
  - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/1.29.yaml?ref=release/1.11.0&depth=1
  - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/default.yaml?ref=release/1.11.0&depth=1
parameters:
  - name: VpcId
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: VpcId
  # Control plane and data plane both use the private subnets.
  - name: ControlplaneSubnetIds
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: PrivateSubnetIds
  - name: DataplaneSubnetIds
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: PrivateSubnetIds
  - name: EksAdminRoleName
    value: Admin
  - name: EksPoweruserRoleName
    value: PowerUser
  - name: EksReadOnlyRoleName
    value: ReadOnly
  # Quoted so the version stays a string; it matches the 1.29.yaml data file above.
  - name: EksVersion
    value: "1.29"
    # valueFrom:
    #   envVariable: GLOBAL_EKS_VERSION
  - name: EksCompute
    value:
      eks_nodegroup_config:
        # General-purpose node group, fixed at a single node.
        - eks_ng_name: ng1
          eks_node_quantity: 1
          eks_node_max_quantity: 1
          eks_node_min_quantity: 1
          eks_node_disk_size: 400
          eks_node_instance_type: "m5.xlarge"
          eks_node_labels:
            usage: core
        # GPU node group. The labels and taints below are mirrored by
        # WorkerLabels / WorkerTolerations in ray-cluster-modules.yaml.
        - eks_ng_name: ng-gpu
          eks_node_quantity: 6
          eks_node_max_quantity: 15
          eks_node_min_quantity: 6
          eks_node_disk_size: 400
          eks_node_instance_type: "g4dn.4xlarge"
          eks_node_labels:
            usage: gpu
            nvidia.com/gpu.present: "true"
          use_gpu_ami: true
          eks_node_taints:
            - key: "nvidia.com/gpu"
              value: "true"
              # operator: "Equal"
              effect: "NoSchedule"
          # NOTE(review): assumed to be a per-node-group setting and the three
          # flags below to be cluster-level -- confirm against the eks module.
          install_nvidia_device_plugin: true
      eks_node_spot: false
      eks_secrets_envelope_encryption: true
      eks_api_endpoint_private: false
  - name: EksAddons
    value:
      # Autoscaling
      deploy_cluster_autoscaler: true
      deploy_metrics_server: true
      # Observability
      deploy_cloudwatch_observability_addon: true
      # Storage
      deploy_aws_fsx_csi: true
---
# Module "fsx-lustre" (group: core): FSx for Lustre module from idf-modules.
# Uses the base VPC/private subnets and the base artifacts bucket; the
# draExportPath, draImportPath and rayNamespaceName values are resolved from
# parametersGlobal in deployment.yaml.
name: fsx-lustre
path: git::https://github.com/awslabs/idf-modules.git//modules/storage/fsx-lustre?ref=release/1.11.0&depth=1
parameters:
  - name: VpcId
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: VpcId
  - name: PrivateSubnetIds
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: PrivateSubnetIds
  - name: FsDeploymentType
    value: SCRATCH_2
  # NOTE(review): FSx documents per-unit storage throughput for PERSISTENT
  # deployment types; confirm how the module treats this value with SCRATCH_2.
  - name: StorageThroughput
    value: 50
  - name: DataBucketName
    valueFrom:
      moduleMetadata:
        group: base
        name: buckets
        key: ArtifactsBucketName
  - name: DraExportPath
    valueFrom:
      parameterValue: draExportPath
  - name: DraImportPath
    valueFrom:
      parameterValue: draImportPath
  # Quoted so the version stays a string rather than a float.
  - name: FsxVersion
    value: "2.15"
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  - name: ImportPolicy
    value: "NEW_CHANGED_DELETED"
30 changes: 30 additions & 0 deletions manifests/fine-tuning-6B/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Deployment manifest for the fine-tuning-6B example.
name: fine-tuning-6B
forceDependencyRedeploy: true
toolchainRegion: us-east-1
# Each group's manifest only references metadata from groups listed above it
# (e.g. ray-cluster reads from base, images, core, integration and ray-operator).
groups:
  - name: base
    path: manifests/fine-tuning-6B/base-modules.yaml
  - name: images
    path: manifests/fine-tuning-6B/images-modules.yaml
  - name: core
    path: manifests/fine-tuning-6B/core-modules.yaml
  - name: integration
    path: manifests/fine-tuning-6B/integration-modules.yaml
  - name: ray-operator
    path: manifests/fine-tuning-6B/ray-operator-modules.yaml
  - name: ray-cluster
    path: manifests/fine-tuning-6B/ray-cluster-modules.yaml
targetAccountMappings:
  # `primary` is the alias modules use in `targetAccount`; the account id is
  # read from the PRIMARY_ACCOUNT environment variable.
  - alias: primary
    accountId:
      valueFrom:
        envVariable: PRIMARY_ACCOUNT
    default: true
    codebuildImage: aws/codebuild/standard:7.0
    # Shared values that module manifests reference via `parameterValue`.
    parametersGlobal:
      rayNamespaceName: ray
      draImportPath: /ray/import/
      draExportPath: /ray/export/
    regionMappings:
      - region: us-east-1
        default: true
10 changes: 10 additions & 0 deletions manifests/fine-tuning-6B/images-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Module "ray" (group: images): ray-image module from aiops-modules.
# Takes the repository name exported by base/ray-ecr; its ImageUri metadata is
# read by the ray-cluster module.
name: ray
path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-image?ref=release/1.5.0&depth=1
targetAccount: primary
parameters:
  - name: EcrRepoName
    valueFrom:
      moduleMetadata:
        group: base
        name: ray-ecr
        key: EcrRepositoryName
63 changes: 63 additions & 0 deletions manifests/fine-tuning-6B/integration-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Module "lustre-on-eks" (group: integration): fsx-lustre-on-eks module from
# idf-modules. Combines metadata from core/eks and core/fsx-lustre; its
# PersistentVolumeClaimName metadata is read by the ray-cluster module.
name: lustre-on-eks
path: git::https://github.com/awslabs/idf-modules.git//modules/integration/fsx-lustre-on-eks?ref=release/1.11.0&depth=1
parameters:
  # --- Inputs from the core/eks module ---
  # Note: the parameter is named "...AdminRoleArn" but reads the
  # EksClusterMasterRoleArn metadata key.
  - name: EksClusterAdminRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterMasterRoleArn
  - name: EksHandlerRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksHandlerRoleArn
  - name: EksClusterName
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterName
  - name: EksOidcArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksOidcArn
  - name: EksClusterSecurityGroupId
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterSecurityGroupId
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  # --- Inputs from the core/fsx-lustre module ---
  - name: FsxFileSystemId
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreFileSystemId
  - name: FsxSecurityGroupId
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreSecurityGroup
  - name: FsxMountName
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreMountName
  - name: FsxDnsName
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreAttrDnsName
  - name: DraExportPath
    valueFrom:
      parameterValue: draExportPath
82 changes: 82 additions & 0 deletions manifests/fine-tuning-6B/ray-cluster-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Module "ray-cluster" (group: ray-cluster): ray-cluster module from
# aiops-modules. Reads metadata from the core, ray-operator, base, images and
# integration groups, so it is listed last in deployment.yaml.
name: ray-cluster
path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-cluster?ref=release/1.5.0&depth=1
parameters:
  # Note: the parameter is named "...AdminRoleArn" but reads the
  # EksClusterMasterRoleArn metadata key.
  - name: EksClusterAdminRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterMasterRoleArn
  - name: EksClusterName
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterName
  - name: EksOidcArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksOidcArn
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  - name: ServiceAccountName
    valueFrom:
      moduleMetadata:
        group: ray-operator
        name: ray-operator
        key: EksServiceAccountName
  - name: HeadResources
    value:
      requests:
        cpu: "1"
        memory: "8G"
      limits:
        cpu: "4"
        memory: "16G"
  - name: WorkerReplicas
    value: 1
  - name: WorkerMinReplicas
    value: 1
  # Same upper bound as eks_node_max_quantity on the ng-gpu node group.
  - name: WorkerMaxReplicas
    value: 15
  - name: WorkerResources
    value:
      requests:
        cpu: "4"
        memory: "8G"
      limits:
        cpu: "14"
        memory: "60G"
  - name: DataBucketName
    valueFrom:
      moduleMetadata:
        group: base
        name: buckets
        key: ArtifactsBucketName
  - name: ImageUri
    valueFrom:
      moduleMetadata:
        group: images
        name: ray
        key: ImageUri
  # Must match eks_node_taints on the ng-gpu node group in core-modules.yaml.
  - name: WorkerTolerations
    value:
      - key: "nvidia.com/gpu"
        value: "true"
        # operator: "Equal"
        effect: "NoSchedule"
  # Must match eks_node_labels on the ng-gpu node group in core-modules.yaml.
  - name: WorkerLabels
    value:
      usage: gpu
  - name: PvcName
    valueFrom:
      moduleMetadata:
        group: integration
        name: lustre-on-eks
        key: PersistentVolumeClaimName
  - name: DraExportPath
    valueFrom:
      parameterValue: draExportPath
60 changes: 60 additions & 0 deletions manifests/fine-tuning-6B/ray-operator-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Module "ray-operator" (group: ray-operator): ray-operator module from
# aiops-modules. All cluster inputs come from core/eks; its
# EksServiceAccountName metadata is read by the ray-cluster module.
name: ray-operator
path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-operator?ref=release/1.5.0&depth=1
parameters:
  # Note: the parameter is named "...AdminRoleArn" but reads the
  # EksClusterMasterRoleArn metadata key.
  - name: EksClusterAdminRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterMasterRoleArn
  - name: EksHandlerRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksHandlerRoleArn
  - name: EksClusterName
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterName
  - name: EksClusterEndpoint
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterEndpoint
  - name: EksOidcArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksOidcArn
  - name: EksOpenidIssuer
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterOpenIdConnectIssuer
  - name: EksCertAuthData
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterCertAuthData
  - name: EksClusterSecurityGroupId
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterSecurityGroupId
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  - name: DataBucketName
    valueFrom:
      moduleMetadata:
        group: base
        name: buckets
        key: ArtifactsBucketName
Loading

0 comments on commit 541a731

Please sign in to comment.