Add TGI tests and CI workflow #355

Merged (6 commits) on Feb 7, 2024
40 changes: 40 additions & 0 deletions .github/workflows/test_inf2_tgi.yml
@@ -0,0 +1,40 @@
name: Optimum neuron / Test TGI on INF2

on:
  push:
    branches: [ main ]
    paths:
      - "setup.py"
      - "optimum/**.py"
      - "text-generation-inference/**"
  pull_request:
    branches: [ main ]
    paths:
      - "setup.py"
      - "optimum/**.py"
      - "text-generation-inference/**"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    name: Run TGI tests
    runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Install python and create venv
        run: |
          sudo apt install python3.8-venv -y
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
      - name: Run TGI server python tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} make tgi_test
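A note on the `concurrency` block: the group key is the workflow name joined with `github.head_ref` (set only for pull requests) or, failing that, the unique `github.run_id`. A minimal Python sketch of the resulting behaviour (a mental model, not part of the PR):

```python
def concurrency_group(workflow: str, head_ref: str, run_id: str) -> str:
    # Mirrors `${{ github.workflow }}-${{ github.head_ref || github.run_id }}`.
    # For pull requests head_ref is the PR branch, so successive pushes to the
    # same branch share one group and cancel-in-progress aborts stale runs.
    # For pushes to main head_ref is empty, so the unique run_id keeps every
    # main build in its own group and nothing gets cancelled.
    return f"{workflow}-{head_ref or run_id}"


assert concurrency_group("tgi", "", "12345") == "tgi-12345"
assert concurrency_group("tgi", "my-branch", "12345") == "tgi-my-branch"
```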
16 changes: 15 additions & 1 deletion Makefile
@@ -24,7 +24,7 @@ clean:

rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*))))

VERSION := $(shell python -W ignore -c "from optimum.neuron.version import __version__; print(__version__)")
VERSION := $(shell gawk 'match($$0, /__version__ = "(.*)"/, a) {print a[1]}' optimum/neuron/version.py)

PACKAGE_DIST = dist/optimum-neuron-$(VERSION).tar.gz
PACKAGE_WHEEL = dist/optimum_neuron-$(VERSION)-py3-none-any.whl
@@ -71,6 +71,20 @@ build_dist: ${PACKAGE_DIST} ${PACKAGE_WHEEL}
pypi_upload: ${PACKAGE_DIST} ${PACKAGE_WHEEL}
	python -m twine upload ${PACKAGE_DIST} ${PACKAGE_WHEEL}

# Tests

test_installs:
	python -m pip install .[tests]
	python -m pip install git+https://github.com/huggingface/transformers.git

# Stand-alone TGI server for unit tests outside of TGI container
tgi_server:
	python -m pip install -r text-generation-inference/server/build-requirements.txt
	make -C text-generation-inference/server clean
	VERSION=${VERSION} make -C text-generation-inference/server gen-server

tgi_test: tgi_server
	python -m pip install .[neuronx] pytest
	find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
		-exec python -m pip install --force-reinstall {} \;
	python -m pytest -s text-generation-inference/tests
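The VERSION variable is now parsed statically with gawk instead of importing `optimum.neuron.version`, so `make` can resolve the version even on a machine where the package and its dependencies are not installed yet (the old shell-out would fail on a bare CI runner). A rough Python equivalent of the gawk one-liner, shown only to illustrate the parse:

```python
import re


def read_version(path: str = "optimum/neuron/version.py") -> str:
    # Scan the file for the `__version__ = "..."` assignment without
    # importing the package, so no dependencies are required.
    with open(path) as f:
        match = re.search(r'__version__ = "(.*)"', f.read())
    if match is None:
        raise ValueError(f"no __version__ found in {path}")
    return match.group(1)
```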
10 changes: 5 additions & 5 deletions text-generation-inference/Dockerfile
@@ -51,6 +51,10 @@ RUN apt-get update -y \
    && apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

# VERSION is a mandatory parameter
ARG VERSION
RUN test -n ${VERSION:?}

# Python server build image
FROM base AS pyserver

@@ -66,15 +70,11 @@ WORKDIR /pyserver
COPY text-generation-inference/server server
COPY --from=tgi /tgi/proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server gen-server
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSION} make -C server gen-server

# Neuron base image (used for deployment)
FROM base AS neuron

# VERSION is a mandatory parameter
ARG VERSION
RUN test -n ${VERSION:?}

# Install system prerequisites
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
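The mandatory `ARG VERSION` check moves from the deployment stage into the shared `base` image, so the `pyserver` stage can forward VERSION to `make gen-server` and the build fails as early as possible when the argument is missing (`test -n ${VERSION:?}` aborts if VERSION is unset or empty). A Python analogue of that fail-fast guard, purely illustrative:

```python
import os


def required_build_arg(name: str = "VERSION") -> str:
    # Analogue of the Dockerfile guard `RUN test -n ${VERSION:?}`:
    # abort immediately with an explicit error when a mandatory
    # parameter was not supplied, instead of failing later in the build.
    value = os.environ.get(name, "")
    if not value:
        raise SystemExit(f"{name} is a mandatory parameter")
    return value
```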
32 changes: 22 additions & 10 deletions text-generation-inference/server/Makefile
@@ -1,6 +1,7 @@
# Initialize base variables
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build_$(pkg_name)
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
@@ -13,14 +14,6 @@ src_dir := $(mkfile_dir)/$(pkg_name)
sources := $(wildcard $(src_dir)/*.py)
deployed_sources := $(subst $(src_dir), $(pkg_dir), $(sources))

# Three python files are generated for each protobuf
protobufs := $(wildcard $(PROTODIR)/*.proto)
pkg_pb_dir := $(pkg_dir)/pb
generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py))
generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py))

# Static files are just copied

define COPY
@@ -30,18 +23,37 @@ endef
$(BUILDDIR)/pyproject.toml: $(mkfile_dir)/pyproject.toml
	mkdir -p $(BUILDDIR)
	$(COPY)
	sed -i -e 's/version = "VERSION"/version = \"${VERSION}\"/' $@

$(pkg_dir)/%.py: $(src_dir)/%.py
	mkdir -p $(pkg_dir)
	$(COPY)

# Generated files are produced by grpcio tools

# If not provided, fetch proto files from TGI
ifndef PROTODIR
PROTODIR := $(BUILDDIR)/tgi/proto
endif

$(BUILDDIR)/tgi/proto/%.proto:
	install -d $(BUILDDIR)/tgi
	curl -L https://github.com/huggingface/text-generation-inference/archive/refs/tags/v1.0.2.tar.gz --output $(BUILDDIR)/tgi/sources.tar.gz
	tar -C $(BUILDDIR)/tgi -xf $(BUILDDIR)/tgi/sources.tar.gz --strip-components=1

# Three python files are generated for each protobuf
protobufs := $(PROTODIR)/generate.proto
pkg_pb_dir := $(pkg_dir)/pb
generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py))
generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py))

$(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto
	mkdir -p $(pkg_pb_dir)
	python -m grpc_tools.protoc -I$(PROTODIR) --python_out=$(pkg_pb_dir) \
		--grpc_python_out=$(pkg_pb_dir) --mypy_out=$(pkg_pb_dir) $^
	sed -i -e 's/^\(import.*pb2\)/from . \1/g' $(pkg_pb_dir)/$*_pb2_grpc.py

gen-server: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources)
	python -m build $(BUILDDIR) --sdist
	python -m build $(BUILDDIR)
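When PROTODIR is not supplied, the rules above download the TGI v1.0.2 sources and use the protos shipped there; grpcio tools then generate three Python files per protobuf, and the sed expression rewrites the absolute `import ..._pb2` in the generated gRPC module into a relative import so the stubs resolve from inside the packaged wheel. A hedged Python sketch of that generation step (assumes grpcio-tools and mypy-protobuf are installed; `gen_stubs` is a hypothetical helper, not code from this PR):

```python
import re
from pathlib import Path

from grpc_tools import protoc


def gen_stubs(proto_dir: str, out_dir: str, proto: str = "generate.proto") -> None:
    # Roughly what the pattern rule does for each .proto file: emit
    # <name>_pb2.py, <name>_pb2.pyi and <name>_pb2_grpc.py into out_dir.
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    protoc.main([
        "grpc_tools.protoc",
        f"-I{proto_dir}",
        f"--python_out={out_dir}",
        f"--grpc_python_out={out_dir}",
        f"--mypy_out={out_dir}",
        f"{proto_dir}/{proto}",
    ])
    # Then, like the sed expression, turn `import generate_pb2 ...` into
    # `from . import generate_pb2 ...` so the import works as a package.
    grpc_file = Path(out_dir) / proto.replace(".proto", "_pb2_grpc.py")
    text = grpc_file.read_text()
    grpc_file.write_text(re.sub(r"^(import.*pb2)", r"from . \1", text, flags=re.M))
```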
5 changes: 4 additions & 1 deletion text-generation-inference/server/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "text-generation-server"
version = "0.0.1"
version = "VERSION"
authors = [{name="David Corvoysier", email="[email protected]" }]
description = "TGI compatible inference server for AWS Neuronx platforms"
dependencies = [
@@ -18,5 +18,8 @@ dependencies = [
    'loguru == 0.6.0'
]

[tool.setuptools]
packages = ["text_generation_server", "text_generation_server.pb"]

[project.scripts]
text-generation-server = 'text_generation_server.cli:app'
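The literal "VERSION" here is a placeholder, not a real version: the server Makefile's sed command stamps the actual package version into the copy of pyproject.toml placed in $(BUILDDIR) before the wheel is built. A Python equivalent of that substitution, for illustration only:

```python
from pathlib import Path


def stamp_version(pyproject: str, version: str) -> None:
    # Same effect as the Makefile line
    #   sed -i -e 's/version = "VERSION"/version = "<version>"/' $@
    path = Path(pyproject)
    text = path.read_text()
    path.write_text(text.replace('version = "VERSION"', f'version = "{version}"'))
```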
61 changes: 61 additions & 0 deletions text-generation-inference/tests/test_generator_slot.py
@@ -0,0 +1,61 @@
import pytest
import torch
from text_generation_server.generator import Slot
from text_generation_server.pb.generate_pb2 import Request
from transformers import AutoTokenizer, GenerationConfig


TOKENIZERS = ["NousResearch/Llama-2-7b-hf", "gpt2"]


@pytest.fixture(params=TOKENIZERS)
def tokenizer(request):
    t = AutoTokenizer.from_pretrained(request.param)
    t.padding_side = "left"
    t.pad_token_id = t.eos_token_id
    return t


@pytest.mark.parametrize(
    "input_text, generated_text",
    [
        [
            "It was a bright cold day in April, and the clocks were striking thirteen.",
            " Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind,"
            " slipped quickly through the glass doors of Victory Mansions, though not quickly enough"
            " to prevent a swirl of gritty dust from entering along with him.",
        ],
        ["This sentence is written in chinese:", "我很感谢你的热情"],
        ["Some text might contain a lot of emojis like 😃", "😍💪 👉 👀"],
    ],
    ids=["spaces", "chinese-utf8", "emojis"],
)
def test_decode_streaming(tokenizer, input_text, generated_text):
    slot = Slot(0, tokenizer)
    request = Request(id=0, inputs=input_text)
    slot.assign(request, GenerationConfig())
    assert slot.cached_text == input_text

    inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt")
    input_ids = inputs["input_ids"][0]
    attention_mask = inputs["attention_mask"][0]
    generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"]

    # We need to regenerate the full text as the tokenizer might change it (extra spaces might be added)
    all_input_ids = torch.cat([input_ids, torch.tensor(generated_tokens)])
    full_text = tokenizer.decode(all_input_ids, skip_special_tokens=True)
    regenerated_text = full_text[len(input_text) :]

    # Initialize the slot with the inputs
    slot.reset(input_ids, attention_mask, selector=None)

    assert slot.generated_tokens == 0

    # Simulate an iterative generation (i.e. don't call select and use known tokens instead)
    decoded_text = ""
    for i in range(len(generated_tokens)):
        text = slot.append(generated_tokens[i])
        assert slot.generated_tokens == i + 1
        decoded_text += text

    assert decoded_text == regenerated_text
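The test checks that decoding token by token through `Slot.append` reproduces exactly the text obtained by decoding the whole sequence at once. That is the hard part of streaming: byte-level tokenizers can split one multi-byte character (an emoji, a CJK glyph) across several tokens, and detokenizers may add or merge spaces at token boundaries. Below is a simplified sketch of the kind of incremental detokenization the test exercises; `stream_decode` is hypothetical and independent of the actual Slot internals in this PR:

```python
from transformers import AutoTokenizer


def stream_decode(tokenizer, token_ids):
    # Decode a growing window and emit only the new suffix. Holding back
    # text that still ends with U+FFFD avoids flushing half of a
    # multi-byte character that is split across tokens.
    prefix_offset, read_offset = 0, 0
    for i in range(1, len(token_ids) + 1):
        prefix = tokenizer.decode(token_ids[prefix_offset:read_offset])
        new_text = tokenizer.decode(token_ids[prefix_offset:i])
        if len(new_text) > len(prefix) and not new_text.endswith("\ufffd"):
            yield new_text[len(prefix):]
            prefix_offset, read_offset = read_offset, i


tok = AutoTokenizer.from_pretrained("gpt2")
ids = tok("Hello world 😃", add_special_tokens=False)["input_ids"]
# The concatenated increments must equal the one-shot decode.
assert "".join(stream_decode(tok, ids)) == tok.decode(ids)
```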