From 974f34336bb36b1b64890c191c558a1575372be7 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Tue, 12 Sep 2023 10:00:27 +0200
Subject: [PATCH] Add support for Llama inference through NeuronModelForCausalLM (#223)

* chore: use latest AWS SDK
* chore(tgi): use latest AWS SDK
* feat(generate): add support for llama
* fix(tgi): return a string for info.dtype
* fix(tgi): slot.select should return a scalar
* fix(tgi): insert leading space in next token text when needed
* fix(NeuronGenerationMixin): remove Marian hack
---
 optimum/exporters/neuron/model_configs.py      |  6 ++++++
 optimum/neuron/generation/utils.py             |  4 ----
 setup.py                                       |  8 ++++----
 text-generation-inference/Dockerfile           | 13 +++++++------
 .../server/text_generation_server/generator.py | 10 ++++++++--
 5 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py
index 3412902f8..fa0aa7b77 100644
--- a/optimum/exporters/neuron/model_configs.py
+++ b/optimum/exporters/neuron/model_configs.py
@@ -351,3 +351,9 @@ def check_model_inputs_order(
 class GPT2NeuronConfig(TextNeuronDecoderConfig):
     NEURONX_ARGS = ["n_positions"]
     NEURONX_CLASS = "gpt2.model.GPT2ForSampling"
+
+
+@register_in_tasks_manager("llama", "text-generation")
+class LLamaNeuronConfig(TextNeuronDecoderConfig):
+    NEURONX_ARGS = ["n_positions"]
+    NEURONX_CLASS = "llama.model.LlamaForSampling"
diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py
index 39b773a34..67ba69fd9 100644
--- a/optimum/neuron/generation/utils.py
+++ b/optimum/neuron/generation/utils.py
@@ -504,10 +504,6 @@ def beam_search(
             else:
                 next_token_logits = outputs.logits[:, -1, :]
 
-            # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
-            # cannot be generated both before and after the `nn.functional.log_softmax` operation.
-            next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len)
-
             # Manually compute log softmax
             # log_softmax(vi) = vi - max(vi) - log(sum(exp(vi - max(vi))))
             logit_max, _ = torch.max(next_token_logits, dim=-1, keepdim=True)
diff --git a/setup.py b/setup.py
index f57b97e5b..4ba8cb5c6 100644
--- a/setup.py
+++ b/setup.py
@@ -53,12 +53,12 @@
     ],
     "neuronx": [
         "wheel",
-        "neuronx-cc==2.*",
-        "torch-neuronx",
-        "transformers-neuronx",
+        "neuronx-cc>=2.9",
+        "torch-neuronx>=1.13.1.1.10.1",
+        "transformers-neuronx>=0.6.106",
         "torch==1.13.1.*",
         "torchvision==0.14.*",
-        "neuronx_distributed >= 0.2.0",
+        "neuronx_distributed >= 0.3.0",
     ],
     "diffusers": ["diffusers"],
 }
diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile
index a3631faa5..047185f41 100644
--- a/text-generation-inference/Dockerfile
+++ b/text-generation-inference/Dockerfile
@@ -89,18 +89,19 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx 2.12.2 packages
 RUN apt-get update -y \
  && apt-get install -y --no-install-recommends \
-    aws-neuronx-dkms=2.11.9.0 \
-    aws-neuronx-collectives=2.15.16.0-db4e2d9a9 \
-    aws-neuronx-runtime-lib=2.15.14.0-279f319f2 \
-    aws-neuronx-tools=2.12.2.0 \
+    aws-neuronx-dkms=2.12.18.0 \
+    aws-neuronx-collectives=2.16.16.0-e59c7bb3e \
+    aws-neuronx-runtime-lib=2.16.14.0-61fdc395f \
+    aws-neuronx-tools=2.13.4.0 \
  && rm -rf /var/lib/apt/lists/* \
  && apt-get clean
 
 ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
 
 RUN pip3 install \
-    torch-neuronx==1.13.1.1.9.1 \
-    transformers-neuronx==0.5.58 \
+    neuronx-cc==2.9.0.40 \
+    torch-neuronx==1.13.1.1.10.1 \
+    transformers-neuronx==0.6.106 \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com
 
 # Install HuggingFace packages
diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py
index 18ff36891..49a97e189 100644
--- a/text-generation-inference/server/text_generation_server/generator.py
+++ b/text-generation-inference/server/text_generation_server/generator.py
@@ -207,7 +207,7 @@ def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.Lon
         Return:
             `torch.LongTensor`: A scalar torch.LongTensor` containing the selected token.
         """
-        return self._selector.select(input_ids, logits)
+        return self._selector.select(input_ids, logits)[0]
 
     @property
     def stopped(self) -> bool:
@@ -248,7 +248,7 @@ def info(self) -> InfoResponse:
         dtype = getattr(self.model.config, "torch_dtype", "float32")
         return InfoResponse(
             requires_padding=True,
-            dtype=dtype,
+            dtype=str(dtype),
             device_type="xla",
         )
 
@@ -370,6 +370,11 @@ def _generate_token(
             slot_input_ids = input_ids[i : i + 1, :]
             next_token = slot.select(slot_input_ids, next_token_logits)
             next_token_text = self.tokenizer.decode(next_token)
+            if not slot.generated_text.endswith(" ") and not next_token_text.startswith(" "):
+                # Some tokenizers do not prepend spaces automatically when decoding a single token
+                contextual_text = self.tokenizer.decode([slot.next_token, next_token])
+                if contextual_text[: -len(next_token_text)].endswith(" "):
+                    next_token_text = " " + next_token_text
             slot.append(next_token, next_token_text)
             generated_text = None
             finish_reason = None
@@ -447,6 +452,7 @@ def from_pretrained(
         Args:
             model_id (`str`):
                 The *model_id* of a model on the HuggingFace hub or the path to a local model.
+                In either case, the hub or local path must also contain a Tokenizer.
             revision (`str`):
                 The revision of the model on the HuggingFace hub.
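
Below is a minimal, illustrative sketch of how the Llama support added by this patch could be used through NeuronModelForCausalLM. It is not part of the patch: the checkpoint name and the export keyword arguments (batch_size, num_cores, auto_cast_type) are assumptions modeled on other Neuron decoder exports and may differ from the actual API.

# Illustrative sketch only; model id and export arguments are assumptions.
from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf"  # illustrative checkpoint

# export=True compiles the checkpoint with transformers-neuronx for Neuron cores.
model = NeuronModelForCausalLM.from_pretrained(
    model_id,
    export=True,
    batch_size=1,
    num_cores=2,
    auto_cast_type="f16",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("What is AWS Inferentia2?", return_tensors="pt")

# Generation goes through the standard transformers generate() API.
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

The text-generation-inference changes in this patch (scalar slot.select, stringified info.dtype, leading-space handling when decoding single tokens) then apply when the same model is served through the TGI Neuron server.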