diff --git a/backend/image_transfer/encoder.py b/backend/image_transfer/encoder.py index ab046f582..9977ddae9 100644 --- a/backend/image_transfer/encoder.py +++ b/backend/image_transfer/encoder.py @@ -98,6 +98,7 @@ def get_manifests_and_list_of_all_blobs( raise RegistryPreconditionFailedException( f"{docker_image} is either not scanned yet or not passing the vulnerability checks." ) from e + raise e manifests.append(manifest) blobs_to_pull += blobs return manifests, blobs_to_pull diff --git a/backend/substrapp/clients/organization.py b/backend/substrapp/clients/organization.py index 332a0e541..cf447c4a2 100644 --- a/backend/substrapp/clients/organization.py +++ b/backend/substrapp/clients/organization.py @@ -184,7 +184,9 @@ def get( ) -> bytes: """Get asset data.""" content = _http_request(_Method.GET, channel, organization_id, url).content + new_checksum = compute_hash(content, key=salt) + if new_checksum != checksum: raise IntegrityError(f"url {url}: checksum doesn't match {checksum} vs {new_checksum}") return content diff --git a/backend/substrapp/compute_tasks/compute_pod.py b/backend/substrapp/compute_tasks/compute_pod.py index 446448b76..2570fec97 100644 --- a/backend/substrapp/compute_tasks/compute_pod.py +++ b/backend/substrapp/compute_tasks/compute_pod.py @@ -2,6 +2,7 @@ import kubernetes import structlog +import yaml from django.conf import settings from substrapp.kubernetes_utils import delete_pod @@ -120,22 +121,6 @@ def create_pod( **container_optional_kwargs, ) - pod_affinity = kubernetes.client.V1Affinity( - pod_affinity=kubernetes.client.V1PodAffinity( - required_during_scheduling_ignored_during_execution=[ - kubernetes.client.V1PodAffinityTerm( - label_selector=kubernetes.client.V1LabelSelector( - match_expressions=[ - kubernetes.client.V1LabelSelectorRequirement( - key="statefulset.kubernetes.io/pod-name", operator="In", values=[os.getenv("HOSTNAME")] - ) - ] - ), - topology_key="kubernetes.io/hostname", - ) - ] - ) - ) image_pull_secret = 
os.getenv("DOCKER_CONFIG_SECRET_NAME") if image_pull_secret: @@ -144,7 +129,9 @@ def create_pod( image_pull_secrets = None spec = kubernetes.client.V1PodSpec( restart_policy="Never", - affinity=pod_affinity, + affinity=yaml.safe_load(os.getenv("COMPUTE_POD_AFFINITY")), + node_selector=yaml.safe_load(os.getenv("COMPUTE_POD_NODE_SELECTOR")), + tolerations=yaml.safe_load(os.getenv("COMPUTE_POD_TOLERATIONS")), containers=[container_compute], volumes=volumes + gpu_volume, security_context=get_pod_security_context(), diff --git a/backend/substrapp/compute_tasks/image_builder.py b/backend/substrapp/compute_tasks/image_builder.py index ca7ff295b..fc4d50429 100644 --- a/backend/substrapp/compute_tasks/image_builder.py +++ b/backend/substrapp/compute_tasks/image_builder.py @@ -33,7 +33,6 @@ def push_blob_to_registry(blob: bytes, tag: str) -> None: def load_remote_function_image(function: orchestrator.Function, channel: str) -> None: # Ask the backend owner of the function if it's available container_image_tag = utils.container_image_tag_from_function(function) - function_image_content = organization_client.get( channel=channel, organization_id=function.owner, diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index 972c19c49..11d810262 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -1,6 +1,11 @@ # Changelog +## [26.9.0] - 2024-07-22 + +### Added + +Configuration of compute pod `affinity`, `nodeSelector` and `tolerations` in the `values.yaml` file. 
## [26.8.3] - 2024-07-16 diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index 9f1d3a581..551eccde1 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.8.3 -appVersion: 0.47.0 +version: "26.9.0" +appVersion: "0.47.0" kubeVersion: ">= 1.19.0-0" description: Main package for Substra type: application diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index cb93bc930..4085376a0 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -118,61 +118,68 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `server.metrics.serviceMonitor.metricRelabelings` | MetricRelabelConfigs to apply to samples before insertion | `[]` | | `server.metrics.serviceMonitor.honorLabels` | Specify honorLabels parameter of the scrape endpoint | `false` | -### Substra worker settings - -| Name | Description | Value | -| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------- | -| `worker.enabled` | Enable worker service | `true` | -| `worker.replicaCount` | Replica count for the worker service | `1` | -| `worker.concurrency` | Maximum amount of tasks to process in parallel | `1` | -| `worker.image.registry` | Substra backend worker image registry | `ghcr.io` | -| `worker.image.repository` | Substra backend worker image repository | `substra/substra-backend` | -| `worker.image.tag` | Substra backend worker image tag (defaults to AppVersion) | `nil` | -| `worker.image.pullPolicy` | Substra backend worker image pull policy | `IfNotPresent` | -| `worker.image.pullSecrets` | Specify image pull secrets | `[]` | -| `worker.podSecurityContext.enabled` | Enable security context | 
`true` | -| `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | -| `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | -| `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `worker.resources.requests.cpu` | Worker container cpu request | `1000m` | -| `worker.resources.requests.memory` | Worker container memory request | `4Gi` | -| `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` | -| `worker.resources.limits.memory` | Worker container memory limit | `8Gi` | -| `worker.nodeSelector` | Node labels for pod assignment | `{}` | -| `worker.tolerations` | Toleration labels for pod assignment | `[]` | -| `worker.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` | -| `worker.rbac.create` | Create a role for the worker | `true` | -| `worker.serviceAccount.create` | Create a service account for the worker | `true` | -| `worker.serviceAccount.name` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the substra.fullname template | `""` | -| `worker.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning | `""` | -| `worker.persistence.size` | The size of the volume. The size of this volume should be sufficient to store many assets. 
| `10Gi` | -| `worker.computePod.maxStartupWaitSeconds` | Set the maximum amount of time we will wait for the compute pod to be ready | `300` | -| `worker.computePod.securityContext.fsGroup` | Set the filesystem group for the Compute pod | `1001` | -| `worker.computePod.securityContext.runAsUser` | Set the user for the Compute pod | `1001` | -| `worker.computePod.securityContext.runAsGroup` | Set the group for the Compute pod | `1001` | -| `worker.computePod.resources.requests.cpu` | Worker compute pod container cpu request | `1000m` | -| `worker.computePod.resources.requests.memory` | Worker compute pod container memory request | `1Gi` | -| `worker.computePod.resources.limits.memory` | Worker compute pod container memory limit | `64Gi` | -| `worker.events.enabled` | Enable event service | `true` | -| `worker.events.image.registry` | Substra event app image registry | `ghcr.io` | -| `worker.events.image.repository` | Substra event app image repository | `substra/substra-backend` | -| `worker.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | -| `worker.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | -| `worker.events.image.pullSecrets` | Specify image pull secrets | `[]` | -| `worker.events.resources.requests.cpu` | Worker events container cpu request | `500m` | -| `worker.events.resources.requests.memory` | Worker events container memory request | `200Mi` | -| `worker.events.resources.limits.cpu` | Worker events container cpu limit | `500m` | -| `worker.events.resources.limits.memory` | Worker events container memory limit | `400Mi` | -| `worker.events.podSecurityContext.enabled` | Enable security context | `true` | -| `worker.events.podSecurityContext.runAsUser` | User ID for the pod | `1001` | -| `worker.events.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | -| `worker.events.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `worker.events.nodeSelector` | 
Node labels for pod assignment | `{}` | -| `worker.events.tolerations` | Toleration labels for pod assignment | `[]` | -| `worker.events.affinity` | Affinity settings for pod assignment | `{}` | -| `worker.events.rbac.create` | Create a role and service account for the event app | `true` | -| `worker.events.serviceAccount.create` | Create a service account for the event app | `true` | -| `worker.events.serviceAccount.name` | The name of the ServiceAccount to use | `""` | +### Substra worker settings. Note that you can access the worker pod name using $(POD_NAME) and its node using $(NODE_NAME). + +| Name | Description | Value | +| ------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------ | +| `worker.enabled` | Enable worker service | `true` | +| `worker.replicaCount` | Replica count for the worker service | `1` | +| `worker.concurrency` | Maximum amount of tasks to process in parallel | `1` | +| `worker.image.registry` | Substra backend worker image registry | `ghcr.io` | +| `worker.image.repository` | Substra backend worker image repository | `substra/substra-backend` | +| `worker.image.tag` | Substra backend worker image tag (defaults to AppVersion) | `nil` | +| `worker.image.pullPolicy` | Substra backend worker image pull policy | `IfNotPresent` | +| `worker.image.pullSecrets` | Specify image pull secrets | `[]` | +| `worker.podSecurityContext.enabled` | Enable security context | `true` | +| `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | +| `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | +| `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | +| `worker.resources.requests.cpu` | Worker container cpu request | `1000m` | +| 
`worker.resources.requests.memory` | Worker container memory request | `4Gi` | +| `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` | +| `worker.resources.limits.memory` | Worker container memory limit | `8Gi` | +| `worker.nodeSelector` | Node labels for pod assignment | `{}` | +| `worker.tolerations` | Toleration labels for pod assignment | `[]` | +| `worker.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` | +| `worker.rbac.create` | Create a role for the worker | `true` | +| `worker.serviceAccount.create` | Create a service account for the worker | `true` | +| `worker.serviceAccount.name` | The name of the ServiceAccount to use. If not set and create is true, a name is generated using the substra.fullname template | `""` | +| `worker.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning | `""` | +| `worker.persistence.size` | The size of the volume. The size of this volume should be sufficient to store many assets. 
| `10Gi` | +| `worker.computePod.maxStartupWaitSeconds` | Set the maximum amount of time we will wait for the compute pod to be ready | `300` | +| `worker.computePod.securityContext.fsGroup` | Set the filesystem group for the Compute pod | `1001` | +| `worker.computePod.securityContext.runAsUser` | Set the user for the Compute pod | `1001` | +| `worker.computePod.securityContext.runAsGroup` | Set the group for the Compute pod | `1001` | +| `worker.computePod.resources.requests.cpu` | Worker compute pod container cpu request | `1000m` | +| `worker.computePod.resources.requests.memory` | Worker compute pod container memory request | `1Gi` | +| `worker.computePod.resources.limits.memory` | Worker compute pod container memory limit | `64Gi` | +| `worker.computePod.nodeSelector` | Node labels for pod assignment | `{}` | +| `worker.computePod.tolerations` | Toleration labels for pod assignment | `[]` | +| `worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].labelSelector.matchExpressions[0].key` | Pod affinity rule definition. | `statefulset.kubernetes.io/pod-name` | +| `worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].labelSelector.matchExpressions[0].operator` | Pod affinity rule definition. | `In` | +| `worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].labelSelector.matchExpressions[0].values` | Pod affinity rule definition. | `["$(POD_NAME)"]` | +| `worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].topologyKey` | Pod affinity rule definition. 
| `kubernetes.io/hostname` | +| `worker.events.enabled` | Enable event service | `true` | +| `worker.events.image.registry` | Substra event app image registry | `ghcr.io` | +| `worker.events.image.repository` | Substra event app image repository | `substra/substra-backend` | +| `worker.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | +| `worker.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | +| `worker.events.image.pullSecrets` | Specify image pull secrets | `[]` | +| `worker.events.resources.requests.cpu` | Worker events container cpu request | `500m` | +| `worker.events.resources.requests.memory` | Worker events container memory request | `200Mi` | +| `worker.events.resources.limits.cpu` | Worker events container cpu limit | `500m` | +| `worker.events.resources.limits.memory` | Worker events container memory limit | `400Mi` | +| `worker.events.podSecurityContext.enabled` | Enable security context | `true` | +| `worker.events.podSecurityContext.runAsUser` | User ID for the pod | `1001` | +| `worker.events.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | +| `worker.events.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | +| `worker.events.nodeSelector` | Node labels for pod assignment | `{}` | +| `worker.events.tolerations` | Toleration labels for pod assignment | `[]` | +| `worker.events.affinity` | Affinity settings for pod assignment | `{}` | +| `worker.events.rbac.create` | Create a role and service account for the event app | `true` | +| `worker.events.serviceAccount.create` | Create a service account for the event app | `true` | +| `worker.events.serviceAccount.name` | The name of the ServiceAccount to use | `""` | +| `worker.accessModes` | Access modes for volume | `["ReadWriteOnce"]` | ### Substra periodic tasks worker settings diff --git a/charts/substra-backend/changes/935.changed b/charts/substra-backend/changes/935.changed new file mode 100644 
index 000000000..649b6a46f --- /dev/null +++ b/charts/substra-backend/changes/935.changed @@ -0,0 +1 @@ +Compute pod `affinity`, `nodeSelector` and `tolerations` are now configured through environment variables defined in the `values.yaml` file. diff --git a/charts/substra-backend/templates/statefulset-worker.yaml b/charts/substra-backend/templates/statefulset-worker.yaml index 58ec341df..33b9ebc77 100644 --- a/charts/substra-backend/templates/statefulset-worker.yaml +++ b/charts/substra-backend/templates/statefulset-worker.yaml @@ -131,6 +131,16 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: COMPUTE_POD_AFFINITY + value: {{ toYaml .Values.worker.computePod.affinity | quote }} + - name: COMPUTE_POD_NODE_SELECTOR + value: {{ toYaml .Values.worker.computePod.nodeSelector | quote }} + - name: COMPUTE_POD_TOLERATIONS + value: {{ toYaml .Values.worker.computePod.tolerations | quote }} - name: COMPUTE_POD_RESOURCES value: {{ toYaml .Values.worker.computePod.resources | quote }} - name: COMPUTE_POD_MAX_STARTUP_WAIT_SECONDS @@ -231,7 +241,7 @@ spec: - metadata: name: subtuple spec: - accessModes: [ "ReadWriteOnce" ] + accessModes: {{ .Values.worker.accessModes }} {{ include "common.storage.class" .Values.worker.persistence }} resources: requests: diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 497a61d9e..0e5f19c49 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -283,7 +283,7 @@ server: ## honorLabels: false -## @section Substra worker settings +## @section Substra worker settings. Note that you can access the worker pod name using $(POD_NAME) and its node using $(NODE_NAME). 
## worker: ## @param worker.enabled Enable worker service @@ -376,6 +376,27 @@ worker: memory: "1Gi" limits: memory: "64Gi" + ## @param worker.computePod.nodeSelector Node labels for pod assignment + ## + nodeSelector: {} + ## @param worker.computePod.tolerations Toleration labels for pod assignment + ## + tolerations: [] + ## @param worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].labelSelector.matchExpressions[0].key Pod affinity rule definition. + ## @param worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].labelSelector.matchExpressions[0].operator Pod affinity rule definition. + ## @param worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].labelSelector.matchExpressions[0].values Pod affinity rule definition. + ## @param worker.computePod.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].topologyKey Pod affinity rule definition. + ## + affinity: + podAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: statefulset.kubernetes.io/pod-name + operator: In + values: + - $(POD_NAME) + topologyKey: kubernetes.io/hostname events: ## @param worker.events.enabled Enable event service ## @@ -435,7 +456,9 @@ worker: ## If not set and create is true, a name is generated using the substra.fullname template ## name: "" - + ## @param worker.accessModes Access modes for volume + ## + accessModes: ["ReadWriteOnce"] ## @section Substra periodic tasks worker settings ## schedulerWorker: