TGI: update to controller version 1.4.0 & bug fixes #470

Merged: 10 commits, Feb 8, 2024
19 changes: 16 additions & 3 deletions Makefile
@@ -40,12 +40,21 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
$(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
python -m build

TGI_VERSION ?= 1.4.0

neuronx-tgi: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile --build-arg VERSION=$(VERSION) -t neuronx-tgi:$(VERSION) .
docker build --rm -f text-generation-inference/Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg TGI_VERSION=$(TGI_VERSION) \
-t neuronx-tgi:$(VERSION) .
docker tag neuronx-tgi:$(VERSION) neuronx-tgi:latest

neuronx-tgi-sagemaker: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile --target sagemaker --build-arg VERSION=$(VERSION) -t neuronx-tgi:$(VERSION) .
docker build --rm -f text-generation-inference/Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg TGI_VERSION=$(TGI_VERSION) \
--target sagemaker \
-t neuronx-tgi:$(VERSION) .

# Creates example scripts from Transformers
transformers_examples:
@@ -81,10 +90,14 @@ test_installs:
tgi_server:
python -m pip install -r text-generation-inference/server/build-requirements.txt
make -C text-generation-inference/server clean
VERSION=${VERSION} make -C text-generation-inference/server gen-server
VERSION=${VERSION} TGI_VERSION=${TGI_VERSION} make -C text-generation-inference/server gen-server

tgi_test: tgi_server
python -m pip install .[neuronx] pytest
find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
-exec python -m pip install --force-reinstall {} \;
python -m pytest -s text-generation-inference/tests

tgi_docker_test: neuronx-tgi
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
python -m pytest -s text-generation-inference/integration-tests
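
Note (not part of the diff): `TGI_VERSION ?= 1.4.0` only sets a default, so the image can be built against another TGI release by overriding the variable on the command line, e.g. `make neuronx-tgi TGI_VERSION=<tag>`, assuming the corresponding release tag exists upstream.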
11 changes: 10 additions & 1 deletion optimum/neuron/generation/token_selector.py
@@ -1,3 +1,4 @@
import copy
import logging
from typing import Optional

@@ -43,13 +44,16 @@ def __init__(
eos_token_id: int,
pad_token_id: int,
logits_warper: Optional[LogitsProcessorList] = None,
seed: Optional[int] = 0,
):
self.mode = mode
self.logits_processor = logits_processor
self.stopping_criteria = stopping_criteria
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.logits_warper = logits_warper
self.generator = torch.Generator()
self.generator.manual_seed(seed)

@classmethod
def create(
@@ -59,6 +63,7 @@ def create(
model: GenerationMixin,
max_seq_length: int,
stopping_criteria: Optional[StoppingCriteriaList] = None,
seed: Optional[int] = 0,
) -> "TokenSelector":
r"""Creates the `TokenSelector` for a specific generation configuration.

@@ -74,10 +79,13 @@
stopping_criteria (`Optional[transformers.generation.StoppingCriteriaList], defaults to `None`):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config.
seed (`Optional[int]`):
The optional seed for sampling. Defaults to zero.
Return:
`torch.LongTensor`: A `torch.LongTensor` containing the selected tokens.
"""
generation_config.validate()
generation_config = copy.deepcopy(generation_config)

unsupported_generation_flags = [
"output_attentions",
@@ -145,6 +153,7 @@ def create(
logits_warper=logits_warper,
eos_token_id=eos_token_id,
pad_token_id=generation_config.pad_token_id,
seed=seed,
)

def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
@@ -171,7 +180,7 @@ def _sample(self, scores: torch.Tensor) -> torch.LongTensor:

# sample
probs = torch.nn.functional.softmax(scores, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1)
next_tokens = torch.multinomial(probs, num_samples=1, generator=self.generator)
# Convert the filtered tokens to actual vocabulary tokens
next_tokens = torch.gather(next_token_indices, 1, next_tokens)
return next_tokens.squeeze(1)
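
A standalone sketch (not part of the diff) illustrating why the seeding change above makes sampling reproducible: two torch.Generator instances seeded identically yield identical torch.multinomial draws.

import torch

# Identically seeded generators produce identical multinomial draws.
probs = torch.nn.functional.softmax(torch.randn(1, 8), dim=-1)

gen_a = torch.Generator()
gen_a.manual_seed(42)
gen_b = torch.Generator()
gen_b.manual_seed(42)

tokens_a = torch.multinomial(probs, num_samples=1, generator=gen_a)
tokens_b = torch.multinomial(probs, num_samples=1, generator=gen_b)
assert torch.equal(tokens_a, tokens_b)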
6 changes: 4 additions & 2 deletions text-generation-inference/Dockerfile
@@ -1,7 +1,9 @@
# Fetch and extract the TGI sources
# Fetch and extract the TGI sources (TGI_VERSION is mandatory)
FROM alpine AS tgi
ARG TGI_VERSION
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v1.0.2.tar.gz /tgi/sources.tar.gz
ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
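Note (not part of the diff): the `${TGI_VERSION:?}` expansion causes the `RUN` step to fail when the `TGI_VERSION` build argument is unset or empty, so a missing argument stops the build early instead of producing a malformed download URL in the following `ADD` instruction.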
155 changes: 155 additions & 0 deletions text-generation-inference/integration-tests/conftest.py
@@ -0,0 +1,155 @@
import asyncio
import contextlib
import os
import random
import shlex
import subprocess
import sys
import time
from tempfile import TemporaryDirectory
from typing import List

import docker
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from text_generation import AsyncClient
from text_generation.types import Response


DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "neuronx-tgi:latest")
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")


class LauncherHandle:
def __init__(self, port: int):
self.client = AsyncClient(f"http://localhost:{port}")

def _inner_health(self):
raise NotImplementedError

async def health(self, timeout: int = 60):
assert timeout > 0
for _ in range(timeout):
if not self._inner_health():
raise RuntimeError("Launcher crashed")

try:
await self.client.generate("test")
return
except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
time.sleep(1)
raise RuntimeError("Health check failed")


class ContainerLauncherHandle(LauncherHandle):
def __init__(self, docker_client, container_name, port: int):
super(ContainerLauncherHandle, self).__init__(port)
self.docker_client = docker_client
self.container_name = container_name

def _inner_health(self) -> bool:
container = self.docker_client.containers.get(self.container_name)
return container.status in ["running", "created"]


class ProcessLauncherHandle(LauncherHandle):
def __init__(self, process, port: int):
super(ProcessLauncherHandle, self).__init__(port)
self.process = process

def _inner_health(self) -> bool:
return self.process.poll() is None


@pytest.fixture(scope="module")
def event_loop():
loop = asyncio.get_event_loop()
yield loop
loop.close()


@pytest.fixture(scope="module")
def data_volume():
tmpdir = TemporaryDirectory()
yield tmpdir.name
# Cleanup the temporary directory using sudo as it contains root files created by the container
subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"))


@pytest.fixture(scope="module")
def launcher(event_loop, data_volume):
@contextlib.contextmanager
def docker_launcher(
model_id: str,
trust_remote_code: bool = False,
):
port = random.randint(8000, 10_000)

args = ["--model-id", model_id, "--env"]

if trust_remote_code:
args.append("--trust-remote-code")

client = docker.from_env()

container_name = f"tgi-tests-{model_id.split('/')[-1]}"

try:
container = client.containers.get(container_name)
container.stop()
container.wait()
except NotFound:
pass

env = {"LOG_LEVEL": "info,text_generation_router=debug"}

if HUGGING_FACE_HUB_TOKEN is not None:
env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN

for var in ["HF_BATCH_SIZE", "HF_SEQUENCE_LENGTH", "HF_AUTOCAST_TYPE", "HF_NUM_CORES"]:
if var in os.environ:
env[var] = os.environ[var]

volumes = [f"{data_volume}:/data"]

container = client.containers.run(
DOCKER_IMAGE,
command=args,
name=container_name,
environment=env,
auto_remove=False,
detach=True,
devices=["/dev/neuron0"],
volumes=volumes,
ports={"80/tcp": port},
shm_size="1G",
)

yield ContainerLauncherHandle(client, container.name, port)

try:
container.stop()
container.wait()
except NotFound:
pass

container_output = container.logs().decode("utf-8")
print(container_output, file=sys.stderr)

container.remove()

return docker_launcher


@pytest.fixture(scope="module")
def generate_load():
async def generate_load_inner(client: AsyncClient, prompt: str, max_new_tokens: int, n: int) -> List[Response]:
futures = [
client.generate(prompt, max_new_tokens=max_new_tokens, decoder_input_details=True) for _ in range(n)
]

return await asyncio.gather(*futures)

return generate_load_inner
2 changes: 2 additions & 0 deletions text-generation-inference/integration-tests/pytest.ini
@@ -0,0 +1,2 @@
[pytest]
asyncio_mode = auto
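
For context (not part of the diff): `asyncio_mode = auto` tells pytest-asyncio to collect and run `async def` tests and fixtures without an explicit marker. A hypothetical example:

# Hypothetical test: under asyncio_mode = auto it is awaited by pytest-asyncio
# even without a @pytest.mark.asyncio decorator.
import asyncio

async def test_sleeps_briefly():
    await asyncio.sleep(0.01)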
18 changes: 18 additions & 0 deletions text-generation-inference/integration-tests/requirements.txt
@@ -0,0 +1,18 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
text-generation >= 0.6.0
pytest >= 7.4.0
pytest-asyncio >= 0.21.1
docker >= 6.1.3
Levenshtein
99 changes: 99 additions & 0 deletions text-generation-inference/integration-tests/test_gpt2.py
@@ -0,0 +1,99 @@
import os

import huggingface_hub
import Levenshtein
import pytest


MODEL_ID = "gpt2"
NEURON_MODEL_ID = "aws-neuron/gpt2-neuronx-bs4-seqlen1024"
BATCH_SIZE = 4
SEQUENCE_LENGTH = 1024
NUM_CORES = 2


@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])
def model_name_or_path(request, data_volume):
if request.param == "hub":
os.environ["HF_BATCH_SIZE"] = str(BATCH_SIZE)
os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH)
os.environ["HF_NUM_CORES"] = str(NUM_CORES)
yield MODEL_ID
elif request.param == "hub-neuron":
yield NEURON_MODEL_ID
else:
model_dir = f"gpt2-neuron-{BATCH_SIZE}x{SEQUENCE_LENGTH}x{NUM_CORES}"
local_path = os.path.join(data_volume, model_dir)
huggingface_hub.snapshot_download(NEURON_MODEL_ID, local_dir=local_path)
# Return the path of the model inside the mounted volume
yield os.path.join("/data", model_dir)


@pytest.fixture(scope="module")
def tgi_service(launcher, model_name_or_path):
with launcher(model_name_or_path) as tgi_service:
yield tgi_service


@pytest.fixture(scope="module")
async def tgi_client(tgi_service):
await tgi_service.health(300)
return tgi_service.client


@pytest.mark.asyncio
async def test_model_single_request(tgi_client):

# Greedy bounded without input
response = await tgi_client.generate(
"What is Deep Learning?",
max_new_tokens=17,
decoder_input_details=True,
)
assert response.details.generated_tokens == 17
assert response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while"

# Greedy bounded with input
response = await tgi_client.generate(
"What is Deep Learning?",
max_new_tokens=17,
return_full_text=True,
decoder_input_details=True,
)
assert response.details.generated_tokens == 17
assert (
response.generated_text
== "What is Deep Learning?\n\nDeep learning is a new field of research that has been around for a while"
)

# Sampling
response = await tgi_client.generate(
"What is Deep Learning?",
do_sample=True,
top_k=50,
top_p=0.9,
repetition_penalty=1.2,
max_new_tokens=1000,
seed=42,
decoder_input_details=True,
)
assert "The purpose of the current post is" in response.generated_text


@pytest.mark.asyncio
async def test_model_multiple_requests(tgi_client, generate_load):
num_requests = 4
responses = await generate_load(
tgi_client,
"What is Deep Learning?",
max_new_tokens=17,
n=num_requests,
)

assert len(responses) == 4
expected = "\n\nDeep learning is a new field of research that has been around for a while"
for r in responses:
assert r.details.generated_tokens == 17
# Compute the similarity with the expectation using the levenshtein distance
# We should not have more than two substitutions or additions
assert Levenshtein.distance(r.generated_text, expected) < 3