Commit

Set up tgi environment values with the ones used to build the model (#529)

* Set up tgi environment values with the ones used to build the model

This is needed to work around the model's static params, so that the Docker entrypoint can adapt the TGI environment to the specified model.
This makes the image easier to use: the default params (i.e. not specifying anything) should be enough for most models.
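
For illustration only (not part of this diff): a minimal sketch of the idea, in which an entrypoint wrapper reads the static parameters the model was compiled with and fills in the TGI environment from them. The function names and config keys below are assumptions, not the actual tgi_env.py implementation.

# Hypothetical sketch, not the code added by this PR: derive the TGI environment
# from the static parameters stored in the exported model's config.
import json
import os


def infer_env_from_neuron_config(model_dir: str) -> dict:
    # The exported model carries its compilation-time (static) parameters;
    # the exact file and keys used here are assumptions for illustration.
    with open(os.path.join(model_dir, "config.json")) as f:
        neuron_config = json.load(f).get("neuron", {})
    return {
        "HF_BATCH_SIZE": str(neuron_config.get("batch_size", 1)),
        "HF_SEQUENCE_LENGTH": str(neuron_config.get("sequence_length", 1024)),
        "HF_NUM_CORES": str(neuron_config.get("num_cores", 2)),
        "HF_AUTO_CAST_TYPE": str(neuron_config.get("auto_cast_type", "fp32")),
    }


def apply_env(env: dict) -> None:
    # Only fill in values the user did not set explicitly.
    for key, value in env.items():
        os.environ.setdefault(key, value)

With something along these lines, running the image against a compiled model without passing any HF_* variables should pick up values consistent with how the model was built.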

Signed-off-by: Raphael Glon <[email protected]>

* fixes

Signed-off-by: Raphael Glon <[email protected]>

* fixes

Signed-off-by: Raphael Glon <[email protected]>

* minor: logging

Signed-off-by: Raphael Glon <[email protected]>

* Integration tests for inf2 + tgi_env wrapper

Signed-off-by: Raphael Glon <[email protected]>

* GitHub CI workflow

Signed-off-by: Raphael Glon <[email protected]>

* To run on GitHub CI we cannot share a volume, so we need to embed the model within an image built on the fly

Signed-off-by: Raphael Glon <[email protected]>

* Be more flexible about expected outputs

Signed-off-by: Raphael Glon <[email protected]>

* Be more flexible about compiler version

Signed-off-by: Raphael Glon <[email protected]>

* Refactor

Bump the dev version,
use a single workflow for TGI,
and simplify the implicit env test a bit.

Signed-off-by: Raphael Glon <[email protected]>

* Misc fixes

Signed-off-by: Raphael Glon <[email protected]>

---------

Signed-off-by: Raphael Glon <[email protected]>
Co-authored-by: Raphael Glon <[email protected]>
oOraph authored Apr 9, 2024
1 parent 1f049e1 commit bb1cc96
Showing 8 changed files with 387 additions and 42 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/test_inf2_tgi.yml
@@ -50,3 +50,8 @@ jobs:
sudo apt install gawk -y
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} make tgi_test
- name: Run integration tests
shell: bash
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} make tgi_docker_test
2 changes: 1 addition & 1 deletion optimum/neuron/version.py
@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.21.dev0"
__version__ = "0.0.22.dev0"

__sdk_version__ = "2.18.0"
18 changes: 9 additions & 9 deletions text-generation-inference/Dockerfile
@@ -53,14 +53,12 @@ RUN apt-get update -y \
&& apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

# VERSION is a mandatory parameter
ARG VERSION
RUN test -n ${VERSION:?}

# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
ARG VERSION

RUN test -n ${VERSION:?} && apt-get update -y \
&& apt-get install -y --no-install-recommends \
make \
python3-venv \
@@ -77,8 +75,10 @@ RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSIO
# Neuron base image (used for deployment)
FROM base AS neuron

ARG VERSION

# Install system prerequisites
RUN apt-get update -y \
RUN test -n ${VERSION:?} && apt-get update -y \
&& apt-get install -y --no-install-recommends \
gnupg2 \
wget \
@@ -108,7 +108,7 @@ RUN pip3 install \

# Install HuggingFace packages
RUN pip3 install \
hf_transfer
hf_transfer huggingface_hub

# Install optimum-neuron
COPY dist/optimum-neuron-${VERSION}.tar.gz optimum-neuron.tar.gz
@@ -137,6 +137,6 @@ ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM neuron

ENTRYPOINT ["text-generation-launcher"]
COPY text-generation-inference/tgi-entrypoint.sh text-generation-inference/tgi_env.py /
ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]
81 changes: 56 additions & 25 deletions text-generation-inference/integration-tests/conftest.py
@@ -1,12 +1,13 @@
import asyncio
import contextlib
import logging
import os
import random
import shlex
import subprocess
import re
import string
import sys
import tempfile
import time
from tempfile import TemporaryDirectory
from typing import List

import docker
@@ -17,9 +18,10 @@
from text_generation.types import Response


LOG = logging.getLogger(__file__)
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "neuronx-tgi:latest")
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
DOCKER_LOCAL_DIR_PATTERN = re.compile(r"^/data/(.*)$")


class LauncherHandle:
@@ -71,15 +73,7 @@ def event_loop():


@pytest.fixture(scope="module")
def data_volume():
tmpdir = TemporaryDirectory()
yield tmpdir.name
# Cleanup the temporary directory using sudo as it contains root files created by the container
subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"))


@pytest.fixture(scope="module")
def launcher(event_loop, data_volume):
def launcher(event_loop):
@contextlib.contextmanager
def docker_launcher(
model_id: str,
@@ -112,33 +106,70 @@ def docker_launcher(
if var in os.environ:
env[var] = os.environ[var]

volumes = [f"{data_volume}:/data"]
# Workaround to bypass docker dind issues that prevent the container running the tests
# from sharing a volume with another container
m = DOCKER_LOCAL_DIR_PATTERN.match(model_id)
if m:
local_dir = model_id
real_model_id = m.group(1)
docker_content = f"""
FROM {DOCKER_IMAGE}
RUN huggingface-cli download --local-dir {local_dir} {real_model_id}
"""

docker_tag = "awesome-workaround:{}".format(
"".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(5))
)
LOG.info(
"Building image on the flight derivated from %s, tagged with %s",
DOCKER_IMAGE,
docker_tag,
)
with tempfile.NamedTemporaryFile() as f:
f.write(docker_content.encode("utf-8"))
f.flush()
image, logs = client.images.build(path=".", dockerfile=f.name, tag=docker_tag)
LOG.info("Successfully built image %s", image.id)
LOG.debug("Build logs %s", logs)
else:
docker_tag = DOCKER_IMAGE
image = None

container = client.containers.run(
DOCKER_IMAGE,
docker_tag,
command=args,
name=container_name,
environment=env,
auto_remove=False,
detach=True,
devices=["/dev/neuron0"],
volumes=volumes,
ports={"80/tcp": port},
shm_size="1G",
)

yield ContainerLauncherHandle(client, container.name, port)

try:
container.stop()
container.wait()
except NotFound:
pass

container_output = container.logs().decode("utf-8")
print(container_output, file=sys.stderr)

container.remove()
try:
container.stop()
container.wait()
except NotFound:
pass
container_output = container.logs().decode("utf-8")
print(container_output, file=sys.stderr)

container.remove()
finally:
# Cleanup the build image
if image:
LOG.info("Cleaning image %s", image.id)
try:
image.remove(force=True)
except NotFound:
pass
except Exception as e:
LOG.error("Error while removing image %s, skiping", image.id)
LOG.exception(e)

return docker_launcher

9 changes: 2 additions & 7 deletions text-generation-inference/integration-tests/test_gpt2.py
@@ -1,6 +1,5 @@
import os

import huggingface_hub
import Levenshtein
import pytest

@@ -13,7 +12,7 @@


@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])
def model_name_or_path(request, data_volume):
def model_name_or_path(request):
if request.param == "hub":
os.environ["HF_BATCH_SIZE"] = str(BATCH_SIZE)
os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH)
@@ -22,11 +21,7 @@ def model_name_or_path(request, data_volume):
elif request.param == "hub-neuron":
yield NEURON_MODEL_ID
else:
model_dir = f"gpt2-neuron-{BATCH_SIZE}x{SEQUENCE_LENGTH}x{NUM_CORES}"
local_path = os.path.join(data_volume, model_dir)
huggingface_hub.snapshot_download(NEURON_MODEL_ID, local_dir=local_path)
# Return the path of the model inside the mounted volume
yield os.path.join("/data", model_dir)
yield os.path.join("/data", NEURON_MODEL_ID)


@pytest.fixture(scope="module")
80 changes: 80 additions & 0 deletions text-generation-inference/integration-tests/test_implicit_env.py
@@ -0,0 +1,80 @@
import os

import pytest
from text_generation.errors import ValidationError


# These tests will often break as they rely on many factors like the optimum version, the neuronx-cc version,
# and on what is synced in the cache for these specific versions...

MODELS = ["openai-community/gpt2", "aws-neuron/gpt2-neuronx-bs4-seqlen1024"]


@pytest.fixture(scope="module", params=MODELS)
def get_model_and_set_env(request):
# the tgi_env.py script will take care of setting these
for var in [
"MAX_BATCH_SIZE",
"MAX_INPUT_LENGTH",
"MAX_TOTAL_TOKEN",
"HF_BATCH_SIZE",
"HF_NUM_CORES",
"HF_SEQUENCE_LENGTH",
"HF_AUTO_CAST_TYPE",
]:
if var in os.environ:
del os.environ[var]
yield request.param


@pytest.fixture(scope="module")
def tgi_service(launcher, get_model_and_set_env):
with launcher(get_model_and_set_env) as tgi_service:
yield tgi_service


@pytest.fixture(scope="module")
async def tgi_client(tgi_service):
await tgi_service.health(300)
return tgi_service.client


@pytest.mark.asyncio
async def test_model_single_request(tgi_client):

# Just verify that the generation works and nothing is raised, with several sets of params

# No params
await tgi_client.generate(
"What is Deep Learning?",
)

response = await tgi_client.generate(
"How to cook beans ?",
max_new_tokens=17,
decoder_input_details=True,
)
assert response.details.generated_tokens == 17

# check error
try:
await tgi_client.generate("What is Deep Learning?", max_new_tokens=170000)
except ValidationError:
pass
else:
raise AssertionError(
"The previous text generation request should have failed, "
"because too many tokens were requested, it succeeded"
)

# Sampling
await tgi_client.generate(
"What is Deep Learning?",
do_sample=True,
top_k=50,
top_p=0.9,
repetition_penalty=1.2,
max_new_tokens=1000,
seed=42,
decoder_input_details=True,
)
16 changes: 16 additions & 0 deletions text-generation-inference/tgi-entrypoint.sh
@@ -0,0 +1,16 @@
#!/bin/bash
set -e -o pipefail -u

export ENV_FILEPATH=$(mktemp)

trap "rm -f ${ENV_FILEPATH}" EXIT

touch $ENV_FILEPATH

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

${SCRIPT_DIR}/tgi_env.py $@

source $ENV_FILEPATH

text-generation-launcher $@
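
For context, the entrypoint above expects tgi_env.py to write shell export statements into the temporary file at $ENV_FILEPATH, which is then sourced before launching text-generation-launcher. A rough sketch of that export step follows; the helper name and example values are illustrative assumptions, not taken from this PR.

#!/usr/bin/env python3
# Illustrative sketch only: write the resolved values as shell exports into the
# file created by tgi-entrypoint.sh, so the entrypoint can source them before launch.
import os


def write_env_file(resolved: dict) -> None:
    env_filepath = os.environ["ENV_FILEPATH"]  # set by tgi-entrypoint.sh (mktemp)
    with open(env_filepath, "w") as f:
        for key, value in resolved.items():
            f.write(f"export {key}={value}\n")


if __name__ == "__main__":
    # Example values only; the real wrapper derives them from the model's static params.
    write_env_file({"MAX_BATCH_SIZE": "4", "MAX_INPUT_LENGTH": "512"})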
