diff --git a/backend/builder/image_builder/image_builder.py b/backend/builder/image_builder/image_builder.py
index 38810abac..ed25b48ed 100644
--- a/backend/builder/image_builder/image_builder.py
+++ b/backend/builder/image_builder/image_builder.py
@@ -23,6 +23,7 @@
 from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name
 from substrapp.docker_registry import USER_IMAGE_REPOSITORY
 from substrapp.kubernetes_utils import delete_pod
+from substrapp.kubernetes_utils import get_resources_requirements
 from substrapp.kubernetes_utils import get_security_context
 from substrapp.lock_local import lock_resource
 from substrapp.utils import timeit
@@ -306,6 +307,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c
         args=args,
         volume_mounts=volume_mounts,
         security_context=container_security_context,
+        resources=get_resources_requirements(cpu_request="1000m", memory_request="4Gi", memory_limit="32Gi"),
     )
diff --git a/backend/substrapp/compute_tasks/compute_pod.py b/backend/substrapp/compute_tasks/compute_pod.py
index bc12bea02..5e929104a 100644
--- a/backend/substrapp/compute_tasks/compute_pod.py
+++ b/backend/substrapp/compute_tasks/compute_pod.py
@@ -6,6 +6,7 @@
 from substrapp.kubernetes_utils import delete_pod
 from substrapp.kubernetes_utils import get_pod_security_context
+from substrapp.kubernetes_utils import get_resources_requirements
 from substrapp.kubernetes_utils import get_security_context
 
 NAMESPACE = settings.NAMESPACE
@@ -112,6 +113,7 @@ def create_pod(
         args=None,
         volume_mounts=volume_mounts + gpu_volume_mounts,
         security_context=get_security_context(),
+        resources=get_resources_requirements(cpu_request="1000m", memory_request="1Gi", memory_limit="64Gi"),
         env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for env_name, env_value in environment.items()],
         **container_optional_kwargs,
     )
diff --git a/backend/substrapp/kubernetes_utils.py b/backend/substrapp/kubernetes_utils.py
index 5bb6ac65f..1486e198a 100644
--- a/backend/substrapp/kubernetes_utils.py
+++ b/backend/substrapp/kubernetes_utils.py
@@ -47,6 +47,14 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) ->
     return security_context
 
 
+def get_resources_requirements(
+    *, cpu_request: str = "1000m", memory_request: str = "200M", memory_limit: str = "2G"
+) -> kubernetes.client.V1ResourceRequirements:
+    return kubernetes.client.V1ResourceRequirements(
+        requests={"cpu": cpu_request, "memory": memory_request}, limits={"memory": memory_limit}
+    )
+
+
 def pod_exists_by_label_selector(k8s_client: kubernetes.client.CoreV1Api, label_selector: str) -> bool:
     """Return True if the pod exists, else False.
diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md
index d19caef94..1cefe9e63 100644
--- a/charts/substra-backend/CHANGELOG.md
+++ b/charts/substra-backend/CHANGELOG.md
@@ -2,6 +2,14 @@
 
+
+## [26.1.0] - 2024-04-17
+
+### Added
+
+- Resource limits and requests (CPU and memory) for all containers.
+
+
 ## [26.0.0] - 2024-04-16
 
 ### Changed
diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml
index b369cba13..a6f1911c3 100644
--- a/charts/substra-backend/Chart.yaml
+++ b/charts/substra-backend/Chart.yaml
@@ -1,7 +1,7 @@
 apiVersion: v2
 name: substra-backend
 home: https://github.com/Substra
-version: 26.0.0
+version: 26.1.0
 appVersion: 0.45.0
 kubeVersion: ">= 1.19.0-0"
 description: Main package for Substra
diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md
index 70a7ce6da..cfaa2e35b 100644
--- a/charts/substra-backend/README.md
+++ b/charts/substra-backend/README.md
@@ -72,7 +72,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `server.ingress.extraHosts` | The list of additional hostnames to be covered with this ingress record | `[]` |
 | `server.ingress.extraTls` | The tls configuration for hostnames to be coverred by the ingress | `[]` |
 | `server.ingress.ingressClassName` | _IngressClass_ that will be used to implement the Ingress | `nil` |
-| `server.resources` | Server container resources requests and limits | `{}` |
+| `server.resources.requests.cpu` | Server container cpu request | `1000m` |
+| `server.resources.requests.memory` | Server container memory request | `6Gi` |
+| `server.resources.limits.cpu` | Server container cpu limit | `2000m` |
+| `server.resources.limits.memory` | Server container memory limit | `12Gi` |
 | `server.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning | `""` |
 | `server.persistence.servermedias.size` | Servermedias volume size | `10Gi` |
 | `server.persistence.servermedias.existingClaim` | use this PVC rather than creating a new one | `nil` |
@@ -119,7 +122,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
 | `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
 | `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` |
-| `worker.resources` | Worker container resources requests and limits | `{}` |
+| `worker.resources.requests.cpu` | Worker container cpu request | `1000m` |
+| `worker.resources.requests.memory` | Worker container memory request | `4Gi` |
+| `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` |
+| `worker.resources.limits.memory` | Worker container memory limit | `8Gi` |
 | `worker.nodeSelector` | Node labels for pod assignment | `{}` |
 | `worker.tolerations` | Toleration labels for pod assignment | `[]` |
 | `worker.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` |
@@ -163,7 +169,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `schedulerWorker.nodeSelector` | Node labels for pod assignment | `{}` |
 | `schedulerWorker.tolerations` | Toleration labels for pod assignment | `[]` |
 | `schedulerWorker.affinity` | Affinity settings for pod assignment | `{}` |
-| `schedulerWorker.resources` | Scheduler container resources requests and limits | `{}` |
+| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `250m` |
+| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `200Mi` |
+| `schedulerWorker.resources.limits.cpu` | Scheduler container cpu limit | `250m` |
+| `schedulerWorker.resources.limits.memory` | Scheduler container memory limit | `400Mi` |
 | `schedulerWorker.podSecurityContext.enabled` | Enable security context | `true` |
 | `schedulerWorker.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
 | `schedulerWorker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
@@ -180,7 +189,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `scheduler.image.tag` | Substra backend tasks scheduler image tag (defaults to AppVersion) | `nil` |
 | `scheduler.image.pullPolicy` | Substra backend task scheduler image pull policy | `IfNotPresent` |
 | `scheduler.image.pullSecrets` | Specify image pull secrets | `[]` |
-| `scheduler.resources` | Scheduler container resources requests and limits | `{}` |
+| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `250m` |
+| `scheduler.resources.requests.memory` | Scheduler container memory request | `200Mi` |
+| `scheduler.resources.limits.cpu` | Scheduler container cpu limit | `250m` |
+| `scheduler.resources.limits.memory` | Scheduler container memory limit | `400Mi` |
 | `scheduler.nodeSelector` | Node labels for pod assignment | `{}` |
 | `scheduler.tolerations` | Toleration labels for pod assignment | `[]` |
 | `scheduler.affinity` | Affinity settings for pod assignment | `{}` |
@@ -194,8 +206,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | Name | Description | Value |
 | --------------------------------------- | -------------------------------------------------------------------------------------------- | ------------------------- |
 | `builder.replicaCount` | Number of builder replicas | `1` |
-| `builder.enabled` | Enable worker service | `true` |
-| `builder.replicaCount` | Replica count for the worker service | `1` |
+| `builder.enabled` | Enable builder service | `true` |
+| `builder.replicaCount` | Replica count for the builder service | `1` |
 | `builder.concurrency` | Maximum amount of tasks to process in parallel | `1` |
 | `builder.image.registry` | Substra backend server image registry | `ghcr.io` |
 | `builder.image.repository` | Substra backend server image repository | `substra/substra-backend` |
@@ -206,7 +218,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `builder.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
 | `builder.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
 | `builder.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` |
-| `builder.resources` | Builder container resources requests and limits | `{}` |
+| `builder.resources.requests.cpu` | Builder container cpu request | `2000m` |
+| `builder.resources.requests.memory` | Builder container memory request | `4Gi` |
+| `builder.resources.limits.cpu` | Builder container cpu limit | `2000m` |
+| `builder.resources.limits.memory` | Builder container memory limit | `8Gi` |
 | `builder.nodeSelector` | Node labels for pod assignment | `{}` |
 | `builder.tolerations` | Toleration labels for pod assignment | `[]` |
 | `builder.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` |
diff --git a/charts/substra-backend/templates/deployment-api-events.yaml b/charts/substra-backend/templates/deployment-api-events.yaml
index 8df3831d1..eb0b5df7e 100644
--- a/charts/substra-backend/templates/deployment-api-events.yaml
+++ b/charts/substra-backend/templates/deployment-api-events.yaml
@@ -41,6 +41,13 @@ spec:
         - name: api-event-app
           image: {{ include "substra-backend.images.name" (dict "img" .Values.api.events.image "defaultTag" $.Chart.AppVersion) }}
           imagePullPolicy: {{ .Values.api.events.image.pullPolicy }}
+          resources:
+            requests:
+              memory: "200Mi"
+              cpu: "500m"
+            limits:
+              memory: "400Mi"
+              cpu: "500m"
           command: ["/bin/bash"]
           {{- if eq .Values.settings "prod" }}
           args: ["-c", "python manage.py consume"]
diff --git a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml
index 2f1da5668..b52c58f5f 100644
--- a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml
+++ b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml
@@ -23,9 +23,21 @@ spec:
       initContainers:
         - name: wait-registry
           image: jwilder/dockerize:0.6.1
+          resources:
+            requests:
+              memory: "200Mi"
+              cpu: "500m"
+            limits:
+              memory: "400Mi"
          command: ['dockerize', '-wait', 'tcp://{{ $.Release.Name }}-docker-registry:5000']
         - name: kaniko
          image: {{ include "common.images.name" $.Values.kaniko.image }}
+          resources:
+            requests:
+              memory: "2Gi"
+              cpu: "1000m"
+            limits:
+              memory: "8Gi"
          args:
            - "--context=/docker-context"
          {{- if .dstImage }}
@@ -48,6 +60,12 @@
       containers:
         - image: gcr.io/google-containers/pause:3.2
          name: pause
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "128Mi"
       volumes:
         - name: kaniko-dir
          emptyDir: {}
diff --git a/charts/substra-backend/templates/deployment-worker-events.yaml b/charts/substra-backend/templates/deployment-worker-events.yaml
index 83aef2ead..aa93b8fb4 100644
--- a/charts/substra-backend/templates/deployment-worker-events.yaml
+++ b/charts/substra-backend/templates/deployment-worker-events.yaml
@@ -41,6 +41,13 @@ spec:
         - name: worker-event-app
           image: {{ include "substra-backend.images.name" (dict "img" .Values.worker.events.image "defaultTag" $.Chart.AppVersion) }}
           imagePullPolicy: {{ .Values.worker.events.image.pullPolicy }}
+          resources:
+            requests:
+              memory: "200Mi"
+              cpu: "500m"
+            limits:
+              memory: "400Mi"
+              cpu: "500m"
           command: ["/bin/bash"]
           {{- if eq .Values.settings "prod" }}
           args: ["-c", "python manage.py consume"]
diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml
index 7327d215c..5b776eca4 100644
--- a/charts/substra-backend/values.yaml
+++ b/charts/substra-backend/values.yaml
@@ -172,17 +172,19 @@ server:
   ##
   ingressClassName:
 
-  ## @param server.resources Server container resources requests and limits
-  ## e.g:
-  ## resources:
-  ##   limits:
-  ##     cpu: 100m
-  ##     memory: 128Mi
-  ##   requests:
-  ##     cpu: 100m
-  ##     memory: 128Mi
+  ## @param server.resources.requests.cpu Server container cpu request
+  ## @param server.resources.requests.memory Server container memory request
+  ## @param server.resources.limits.cpu Server container cpu limit
+  ## @param server.resources.limits.memory Server container memory limit
   ##
-  resources: {}
+  resources:
+    requests:
+      cpu: "1000m"
+      memory: "6Gi"
+    limits:
+      cpu: "2000m"
+      memory: "12Gi"
+
   persistence:
     ## @param server.persistence.storageClass Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning
@@ -304,9 +306,18 @@ worker:
     runAsUser: 1001
     runAsGroup: 1001
     fsGroup: 1001
-  ## @param worker.resources Worker container resources requests and limits
-  ##
-  resources: {}
+  ## @param worker.resources.requests.cpu Worker container cpu request
+  ## @param worker.resources.requests.memory Worker container memory request
+  ## @param worker.resources.limits.cpu Worker container cpu limit
+  ## @param worker.resources.limits.memory Worker container memory limit
+  ##
+  resources:
+    requests:
+      cpu: "1000m"
+      memory: "4Gi"
+    limits:
+      cpu: "2000m"
+      memory: "8Gi"
   ## @param worker.nodeSelector Node labels for pod assignment
   ##
   nodeSelector: {}
@@ -422,9 +433,18 @@ schedulerWorker:
   ## @param schedulerWorker.affinity Affinity settings for pod assignment
   ##
   affinity: {}
-  ## @param schedulerWorker.resources Scheduler container resources requests and limits
-  ##
-  resources: {}
+  ## @param schedulerWorker.resources.requests.cpu Scheduler container cpu request
+  ## @param schedulerWorker.resources.requests.memory Scheduler container memory request
+  ## @param schedulerWorker.resources.limits.cpu Scheduler container cpu limit
+  ## @param schedulerWorker.resources.limits.memory Scheduler container memory limit
+  ##
+  resources:
+    requests:
+      cpu: "250m"
+      memory: "200Mi"
+    limits:
+      cpu: "250m"
+      memory: "400Mi"
   ## @param schedulerWorker.podSecurityContext.enabled Enable security context
   ## @param schedulerWorker.podSecurityContext.runAsUser User ID for the pod
   ## @param schedulerWorker.podSecurityContext.runAsGroup Group ID for the pod
@@ -456,9 +476,18 @@ scheduler:
     tag: null
     pullPolicy: IfNotPresent
     pullSecrets: []
-  ## @param scheduler.resources Scheduler container resources requests and limits
-  ##
-  resources: {}
+  ## @param scheduler.resources.requests.cpu Scheduler container cpu request
+  ## @param scheduler.resources.requests.memory Scheduler container memory request
+  ## @param scheduler.resources.limits.cpu Scheduler container cpu limit
+  ## @param scheduler.resources.limits.memory Scheduler container memory limit
+  ##
+  resources:
+    requests:
+      cpu: "250m"
+      memory: "200Mi"
+    limits:
+      cpu: "250m"
+      memory: "400Mi"
   ## @param scheduler.nodeSelector Node labels for pod assignment
   ##
   nodeSelector: {}
@@ -484,10 +513,10 @@ scheduler:
 ## @param builder.replicaCount Number of builder replicas
 ##
 builder:
-  ## @param builder.enabled Enable worker service
+  ## @param builder.enabled Enable builder service
   ##
   enabled: true
-  ## @param builder.replicaCount Replica count for the worker service
+  ## @param builder.replicaCount Replica count for the builder service
   ##
   replicaCount: 1
@@ -524,17 +553,18 @@ builder:
     fsGroup: 1001
 
 
-  ## @param builder.resources Builder container resources requests and limits
-  ## e.g:
-  ## resources:
-  ##   limits:
-  ##     cpu: 100m
-  ##     memory: 128Mi
-  ##   requests:
-  ##     cpu: 100m
-  ##     memory: 128Mi
+  ## @param builder.resources.requests.cpu Builder container cpu request
+  ## @param builder.resources.requests.memory Builder container memory request
+  ## @param builder.resources.limits.cpu Builder container cpu limit
+  ## @param builder.resources.limits.memory Builder container memory limit
   ##
-  resources: {}
+  resources:
+    requests:
+      cpu: "2000m"
+      memory: "4Gi"
+    limits:
+      cpu: "2000m"
+      memory: "8Gi"
 
   ## @param builder.nodeSelector Node labels for pod assignment
   ##
@@ -843,6 +873,13 @@ postgresql:
       capabilities:
         drop:
           - ALL
+    resources:
+      requests:
+        cpu: "1000m"
+        memory: "2Gi"
+      limits:
+        cpu: "1000m"
+        memory: "4Gi"
 
 ## @skip redis
 ##
@@ -857,6 +894,13 @@ redis:
     service:
       ports:
         redis: 6379
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "512Mi"
+      limits:
+        cpu: "500m"
+        memory: "1024Mi"
   replica:
     replicaCount: 0
   commonConfiguration: |-
@@ -872,10 +916,17 @@ docker-registry:
   storage: filesystem
   persistence:
     enabled: true
-    size: 10Gi
+    size: 50Gi
     deleteEnabled: true
   service:
     type: NodePort
+  resources:
+    requests:
+      cpu: "500m"
+      memory: "16Gi"
+    limits:
+      cpu: "500m"
+      memory: "64Gi"
 
 ## @skip minio
 ##
@@ -896,6 +947,13 @@ minio:
      capabilities:
        drop:
          - ALL
+  resources:
+    requests:
+      cpu: "500m"
+      memory: "16Gi"
+    limits:
+      cpu: "1000m"
+      memory: "64Gi"
 
 ## @skip localstack
 ##
@@ -904,6 +962,13 @@ localstack:
   service:
     edgeService:
       nodePort: ""
+  resources:
+    requests:
+      cpu: "500m"
+      memory: "16Gi"
+    limits:
+      cpu: "500m"
+      memory: "64Gi"
   environment:
     - name: SERVICES
       value: s3
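
For reference, a minimal sketch of how the new `get_resources_requirements` helper behaves. This is not part of the patch; it assumes the `kubernetes` Python client is installed and that the helper is importable as added above, and the override values mirror the call in `image_builder.py`.

```python
# Sketch only: exercises get_resources_requirements() as introduced in this diff.
from substrapp.kubernetes_utils import get_resources_requirements

# Defaults as defined in kubernetes_utils.py:
# cpu_request="1000m", memory_request="200M", memory_limit="2G".
default_resources = get_resources_requirements()
assert default_resources.requests == {"cpu": "1000m", "memory": "200M"}
assert default_resources.limits == {"memory": "2G"}

# The image-builder container overrides the defaults (see _build_container above);
# the compute pod does the same with memory_request="1Gi", memory_limit="64Gi".
builder_resources = get_resources_requirements(
    cpu_request="1000m", memory_request="4Gi", memory_limit="32Gi"
)

# The helper sets CPU/memory requests and a memory limit only; no CPU limit is
# applied, so these pods can burst above their CPU request without throttling.
assert builder_resources.requests == {"cpu": "1000m", "memory": "4Gi"}
assert builder_resources.limits == {"memory": "32Gi"}
```

Note that the chart values (`server.resources`, `worker.resources`, etc.) cover the long-running service containers, while the helper above covers the build and compute-task pods created at runtime, which are not configurable through `values.yaml` in this change.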