Feat!: Remove SDK installation from our base image (#443)

# The problem We had to regenerate SDK images each release https://zapatacomputing.atlassian.net/browse/ORQSDK-1032 # This PR's solution Remove installation of SDK in our worker base images # Checklist _Check that this PR satisfies the following items:_ - [x] Tests have been added for new features/changed behavior (if no new features have been added, check the box). - [ ] The [changelog file](CHANGELOG.md) has been updated with a user-readable description of the changes (if the change isn't visible to the user in any way, check the box). - [x] The PR's title is prefixed with `<feat/fix/chore/imp[rovement]/int[ernal]/docs>[!]:` - [x] The PR is linked to a JIRA ticket (if there's no suitable ticket, check the box).
zapata-engineering · Sep 25, 2024 · 26daea6 · 26daea6
1 parent c8929cb
commit 26daea6
Show file tree

Hide file tree

Showing 18 changed files with 192 additions and 298 deletions.
diff --git a/.github/workflows/orquestra-sdk-integration.yml b/.github/workflows/orquestra-sdk-integration.yml
@@ -49,7 +49,7 @@ jobs:
           source ./venv/bin/activate
           make github-actions-integration
 
-      - name: Run performance test
+      - name: Run integration test
         shell: bash
         run: |
           source ./venv/bin/activate

diff --git a/.github/workflows/publish-cuda-docker-image.yml b/.github/workflows/publish-cuda-docker-image.yml
@@ -3,14 +3,6 @@ name: Publish orquestra-sdk-base Docker Image with CUDA support
 on:
   workflow_dispatch:
     inputs:
-      sdk_requirement:
-        type: string
-        description: |
-          Orquestra SDK requirement to build the image with. 
-          Example of using a published version of the SDK:
-          orquestra-sdk[ray]==0.63.0
-          Example of using an unpublished version:
-          orquestra-sdk[ray] @ git+https://github.com/zapata-engineering/[email protected]#subdirectory=projects/orquestra-sdk
       docker_tag:
         type: string
         description: |
@@ -81,6 +73,6 @@ jobs:
           # as on the command line `--build-arg`, one to a line:
           # docker_build_args: BUILD_ARG_ONE=value1\nBUILD_ARG_TWO=value2
           # DUE TO JSON LIMITATIONS, NEWLINES MUST BE EXPLICITLY ADDED AS \n
-          docker_build_args: SDK_REQUIREMENT=${{ inputs.sdk_requirement }}
+          docker_build_args:
           # leave blank for linux/amd64 (recommended)
           target_platforms: ''
diff --git a/.github/workflows/publish-docker-image.yml b/.github/workflows/publish-docker-image.yml
@@ -3,14 +3,6 @@ name: Publish orquestra-sdk-base Docker Image
 on:
   workflow_dispatch:
     inputs:
-      sdk_requirement:
-        type: string
-        description: |
-          Orquestra SDK requirement to build the image with. 
-          Example of using a published version of the SDK:
-          orquestra-sdk[ray]==0.63.0
-          Example of using an unpublished version:
-          orquestra-sdk[ray] @ git+https://github.com/zapata-engineering/[email protected]#subdirectory=projects/orquestra-sdk
       docker_tag:
         type: string
         description: |
@@ -79,6 +71,6 @@ jobs:
           # as on the command line `--build-arg`, one to a line:
           # docker_build_args: BUILD_ARG_ONE=value1\nBUILD_ARG_TWO=value2
           # DUE TO JSON LIMITATIONS, NEWLINES MUST BE EXPLICITLY ADDED AS \n
-          docker_build_args: SDK_REQUIREMENT=${{ inputs.sdk_requirement }}
+          docker_build_args:
           # leave blank for linux/amd64 (recommended)
           target_platforms: 'linux/amd64,linux/arm64'
diff --git a/projects/orquestra-sdk/docker/Dockerfile b/projects/orquestra-sdk/docker/Dockerfile
@@ -2,7 +2,6 @@
 # Base image for running Orquestra tasks.
 # Published at hub.nexus.orquestra.io/zapatacomputing/orquestra-sdk-base
 FROM python:3.11.6-slim-bullseye
-ARG SDK_REQUIREMENT
 
 # Set by BuildKit
 ARG TARGETARCH
@@ -38,7 +37,7 @@ RUN <<EOF
 set -ex
 . "$VIRTUAL_ENV/bin/activate"
 python -m pip install --no-cache-dir -U pip wheel
-python -m pip install --no-cache-dir "${SDK_REQUIREMENT}"
+python -m pip install --no-cache-dir ray[default]==2.30
 EOF
 
 # Prefer to use pip, python, and other binaries from the virtual env.

diff --git a/projects/orquestra-sdk/docker/cuda.Dockerfile b/projects/orquestra-sdk/docker/cuda.Dockerfile
@@ -8,7 +8,6 @@
 ARG CUDA_MINOR_VERSION=11.8
 FROM nvidia/cuda:${CUDA_MINOR_VERSION}.0-runtime-ubuntu22.04
 
-ARG SDK_REQUIREMENT
 ARG PYTHON_VERSION=3.11.6
 ARG TARGETARCH="amd64"
 
@@ -56,7 +55,7 @@ RUN <<EOF
 set -ex
 . "$VIRTUAL_ENV/bin/activate"
 python -m pip install --no-cache-dir -U pip wheel
-python -m pip install --no-cache-dir "${SDK_REQUIREMENT}"
+python -m pip install --no-cache-dir ray[default]==2.30
 EOF
 
 # Prefer to use pip, python, and other binaries from the virtual env.

diff --git a/projects/orquestra-sdk/docs/examples/tests/test_dependency_installation.py b/projects/orquestra-sdk/docs/examples/tests/test_dependency_installation.py
@@ -197,7 +197,9 @@ def test_good_practice_defaults():
         # Then
         assert src_import.type == "INLINE_IMPORT"
         assert len(deps_imports) == 1
-        assert deps_imports[0].type == "PYTHON_IMPORT"
+        # it depends if you have installed SDK with version from PyPi, or
+        # its local-dev version, it changes what import is used for SDK
+        assert deps_imports[0].type in {"PYTHON_IMPORT", "GIT_IMPORT"}
 
     @staticmethod
     def test_good_practice_python_imports():
@@ -212,7 +214,9 @@ def test_good_practice_python_imports():
         assert deps_imports is not None
         assert len(deps_imports) == 2
         assert deps_imports[0].type == "PYTHON_IMPORT"
-        assert deps_imports[1].type == "PYTHON_IMPORT"
+        # it depends if you have installed SDK with version from PyPi, or
+        # its local-dev version, it changes what import is used for SDK
+        assert deps_imports[1].type in {"PYTHON_IMPORT", "GIT_IMPORT"}
 
     @staticmethod
     def test_good_practice_git_import_with_auth():
@@ -225,7 +229,9 @@ def test_good_practice_git_import_with_auth():
         # Then
         assert src_import.type == "GIT_IMPORT"
         assert len(deps_imports) == 1
-        assert deps_imports[0].type == "PYTHON_IMPORT"
+        # it depends if you have installed SDK with version from PyPi, or
+        # its local-dev version, it changes what import is used for SDK
+        assert deps_imports[0].type in {"PYTHON_IMPORT", "GIT_IMPORT"}
 
     @staticmethod
     def test_simple_task_explicit():
@@ -239,7 +245,9 @@ def test_simple_task_explicit():
         # Then
         assert src_import.type == "INLINE_IMPORT"
         assert len(deps_imports) == 1
-        assert deps_imports[0].type == "PYTHON_IMPORT"
+        # it depends if you have installed SDK with version from PyPi, or
+        # its local-dev version, it changes what import is used for SDK
+        assert deps_imports[0].type in {"PYTHON_IMPORT", "GIT_IMPORT"}
 
     @staticmethod
     def test_python_imports(monkeypatch, tmp_path):
@@ -253,13 +261,14 @@ def test_python_imports(monkeypatch, tmp_path):
         for task in [task1, task2, task3]:
             # When
             src_import, deps_imports = _import_models(task)
-
             # Then
             assert src_import.type == "INLINE_IMPORT"
             assert deps_imports is not None
             assert len(deps_imports) == 2
             assert deps_imports[0].type == "PYTHON_IMPORT"
-            assert deps_imports[1].type == "PYTHON_IMPORT"
+            # it depends if you have installed SDK with version from PyPi, or
+            # its local-dev version, it changes what import is used for SDK
+            assert deps_imports[1].type in {"PYTHON_IMPORT", "GIT_IMPORT"}
 
     @staticmethod
     def test_github_import_private_repo():

diff --git a/projects/orquestra-sdk/src/orquestra/sdk/_client/_base/_docker_images.py b/projects/orquestra-sdk/src/orquestra/sdk/_client/_base/_docker_images.py
@@ -0,0 +1,10 @@
+################################################################################
+# © Copyright 2024 Zapata Computing Inc.
+################################################################################
+
+HEAD_NODE_IMAGE = (
+    "hub.stage.nexus.orquestra.io/zapatacomputing/workflow-driver-ray:head-node-1.0.0a3"
+)
+DEFAULT_WORKER_IMAGE = (
+    "hub.stage.nexus.orquestra.io/zapatacomputing/orquestra-sdk-base:worker-1.0.0a1"
+)
diff --git a/projects/orquestra-sdk/src/orquestra/sdk/_client/_base/_traversal.py b/projects/orquestra-sdk/src/orquestra/sdk/_client/_base/_traversal.py
@@ -9,6 +9,7 @@
 import collections.abc
 import hashlib
 import inspect
+import os
 import re
 import typing as t
 import warnings
@@ -23,13 +24,21 @@
 )
 from orquestra.workflow_shared.schema import ir, responses
 from orquestra.workflow_shared.secrets import Secret
+from packaging import version
 from pip_api._parse_requirements import Requirement
 
-from . import _dsl, _workflow
+from . import _docker_images, _dsl, _workflow
 
 N_BYTES_IN_HASH = 8
 
 
+def _get_default_image(num_gpus: t.Optional[str]):
+    image = _docker_images.DEFAULT_WORKER_IMAGE
+    if num_gpus:
+        image = f"{image}-cuda"
+    return image
+
+
 def _make_key(obj: t.Any):
     """Returns a hashable key for all types.
 
@@ -766,6 +775,23 @@ def _make_invocation_model(
         arg_name: graph.get_node_id(arg_val) for arg_name, arg_val in invocation.kwargs
     }
 
+    gpu_used: t.Optional[str]
+    if invocation.resources.gpu:
+        gpu_used = invocation.resources.gpu
+    elif task_models_dict[invocation.task].resources is not None:
+        task_model = task_models_dict[invocation.task]
+        # this is just to silence pyright which doesn't believe elif check
+        # we dont multiprocess that variables, so we dont have race conditions here
+        assert task_model.resources is not None
+        gpu_used = task_model.resources.gpu
+    else:
+        gpu_used = None
+
+    custom_image = (
+        invocation.custom_image
+        or task_models_dict[invocation.task].custom_image
+        or _get_default_image(gpu_used)
+    )
     return ir.TaskInvocation(
         id=_make_invocation_id(
             task_models_dict[invocation.task].fn_ref.function_name,
@@ -777,7 +803,7 @@ def _make_invocation_model(
         kwargs_ids=kwargs_ids,
         output_ids=graph.output_ids_for_invocation(invocation),
         resources=_make_resources_model(invocation.resources),
-        custom_image=invocation.custom_image,
+        custom_image=custom_image,
         env_vars=invocation.env_vars,
     )
 
@@ -844,7 +870,30 @@ def flatten_graph(
                 else:
                     import_models_dict[imp] = _make_import_model(imp)
 
-    sdk_python_import = _dsl.InstalledImport(package_name="orquestra-sdk")
+    sdk_version = get_current_sdk_version()
+    sdk_version_parsed = version.parse(sdk_version.original)
+
+    if not (sdk_version_parsed.is_devrelease or sdk_version_parsed.is_prerelease):
+        sdk_python_import = _dsl.PythonImports(
+            f"orquestra-sdk[all]=={sdk_version.original}"
+        )
+    else:
+        # this only happens on dev environment, should not happen on users' machines
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="You're working on detached HEAD")
+            warnings.filterwarnings("ignore", message="You have uncommitted changes")
+            # this is a workaround where .model is called outside of SDK repo
+            # its save to do as this should only happen if SDK is installed as editable
+            # install (or from git repository)
+            path_to_sdk = os.path.realpath(__file__)
+            git_ref = _dsl.infer_git_ref(path_to_sdk).resolve()
+            sdk_python_import = _dsl.GithubImport(
+                git_ref=git_ref,
+                repo="zapata-engineering/orquestra-sdk",
+                package_name="orquestra-sdk",
+                extras="all",
+            )
+
     ir_sdk_import = _make_import_model(sdk_python_import)
     import_models_dict[sdk_python_import] = ir_sdk_import
 
@@ -871,12 +920,14 @@ def flatten_graph(
     }
 
     sdk_version = get_current_sdk_version()
+
     python_version = get_current_python_version()
 
     return ir.WorkflowDef(
         metadata=ir.WorkflowMetadata(
             sdk_version=sdk_version,
             python_version=python_version,
+            head_node_image=_docker_images.HEAD_NODE_IMAGE,
         ),
         resources=_make_resources_model(workflow_def._resources, is_task=False),
         # At the moment 'orq submit workflow-def <name>' assumes that the <name> is

diff --git a/projects/orquestra-sdk/tests/integration/ray/test_integration.py b/projects/orquestra-sdk/tests/integration/ray/test_integration.py
@@ -947,8 +947,9 @@ def test_setting_resources(
             # Then
             calls = client.add_options.call_args_list
 
-            # We should only have two calls: our invocation and the aggregation step
-            assert len(calls) == 2
+            # We should only have two calls: invocation aggregation step and aggregation
+            # error handling step
+            assert len(calls) == 3
             # Checking our call did not have any resources included
             assert calls[0] == call(
                 ANY,
@@ -975,15 +976,15 @@ def test_setting_resources(
                     None,
                     None,
                     {
-                        "image:hub.nexus.orquestra.io/zapatacomputing/orquestra-sdk-base:mocked": 1  # noqa: E501
+                        "image:hub.stage.nexus.orquestra.io/zapatacomputing/orquestra-sdk-base:worker-1.0.0a1": 1  # noqa: E501
                     },
                     {},
                 ),
                 (
                     None,
                     1,
                     {
-                        "image:hub.nexus.orquestra.io/zapatacomputing/orquestra-sdk-base:mocked-cuda": 1  # noqa: E501
+                        "image:hub.stage.nexus.orquestra.io/zapatacomputing/orquestra-sdk-base:worker-1.0.0a1-cuda": 1  # noqa: E501
                     },
                     {
                         "num_gpus": 1,
@@ -1021,8 +1022,9 @@ def test_with_env_set(
                 # Then
                 calls = client.add_options.call_args_list
 
-                # We should only have two calls: our invocation and the aggregation step
-                assert len(calls) == 2
+                # We should only have two calls: invocation and the aggregation step
+                # and error-handling aggregation step
+                assert len(calls) == 3
                 # Checking our call did not have any resources included
                 assert calls[0] == call(
                     ANY,
@@ -1055,8 +1057,10 @@ def test_with_env_not_set(
                 # Then
                 calls = client.add_options.call_args_list
 
-                # We should only have two calls: our invocation and the aggregation step
-                assert len(calls) == 2
+                # We should only have three calls:
+                # We should only have two calls: invocation and the aggregation step
+                # and error-handling aggregation step
+                assert len(calls) == 3
                 # Checking our call did not have any resources included
                 assert calls[0] == call(
                     ANY,

diff --git a/projects/orquestra-sdk/tests/sdk/driver/test_client.py b/projects/orquestra-sdk/tests/sdk/driver/test_client.py
@@ -4,6 +4,7 @@
 """
 Tests for orquestra.sdk._base._driver._client.
 """
+import warnings
 from datetime import timedelta
 from typing import Any, Dict, List, Optional
 from unittest.mock import Mock, create_autospec
@@ -12,11 +13,12 @@
 import pytest
 import responses
 from orquestra.workflow_shared._spaces._structs import ProjectRef
-from orquestra.workflow_shared.schema.ir import WorkflowDef
+from orquestra.workflow_shared.schema.ir import Version, WorkflowDef
 from orquestra.workflow_shared.schema.responses import JSONResult, PickleResult
 from orquestra.workflow_shared.schema.workflow_run import RunStatus, State, TaskRun
 
 import orquestra.sdk as sdk
+from orquestra.sdk._client._base import _traversal
 from orquestra.sdk._client._base._driver import _exceptions
 from orquestra.sdk._client._base._driver._client import (
     DriverClient,
@@ -132,21 +134,35 @@ def task_run_id(self):
     def task_inv_id(self):
         return "00000000-0000-0000-0000-000000000000"
 
+    @pytest.fixture
+    def mask_sdk_version(self, monkeypatch):
+        monkeypatch.setattr(
+            _traversal,
+            "get_current_sdk_version",
+            lambda *_: Version(
+                original="0.66.0", major=0, minor=66, patch=0, is_prerelease=False
+            ),
+        )
+
     @pytest.fixture
     def status(self):
         return RunStatus(state=State.SUCCEEDED, start_time=None, end_time=None)
 
     @pytest.fixture
     def workflow_def(self):
-        @sdk.task
+        @sdk.task(custom_image="")
         def task():
             return 1
 
         @sdk.workflow
         def workflow():
             return task()
 
-        return workflow().model
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", "Attempting to read a workflow definition"
+            )
+            return workflow().model
 
     class TestWorkflowDefinitions:
         # ------ queries ------
@@ -454,6 +470,7 @@ def endpoint_mocker(endpoint_mocker_base, base_uri: str):
                 ],
             )
             def test_params_encoding(
+                mask_sdk_version,
                 endpoint_mocker,
                 client: DriverClient,
                 workflow_def_id,