
Commit

Merge branch 'main' into update_docs
michaelbenayoun committed May 3, 2024
2 parents ece8829 + 18460aa commit 2863da9
Showing 40 changed files with 455 additions and 258 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/test_inf2_tgi.yml
@@ -44,6 +44,10 @@ jobs:
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install integration tests prerequisites
run: |
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -r text-generation-inference/tests/requirements.txt
- name: Run TGI server python tests
run: |
# gawk is required when invoking the Makefile targets
@@ -55,12 +59,8 @@ jobs:
run: |
source aws_neuron_venv_pytorch/bin/activate
make neuronx-tgi
- name: Install integration tests prerequisites
run: |
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
- name: Run TGI docker tests
shell: bash
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} python -m pytest -sv text-generation-inference/integration-tests
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} python -m pytest -sv text-generation-inference/tests -k integration
11 changes: 6 additions & 5 deletions Makefile
@@ -40,7 +40,7 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
$(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
python -m build

TGI_VERSION ?= 1.4.4
TGI_VERSION ?= 2.0.1

neuronx-tgi: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile \
@@ -93,11 +93,12 @@ tgi_server:
VERSION=${VERSION} TGI_VERSION=${TGI_VERSION} make -C text-generation-inference/server gen-server

tgi_test: tgi_server
python -m pip install .[neuronx] pytest
python -m pip install .[neuronx]
python -m pip install -r text-generation-inference/tests/requirements.txt
find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
-exec python -m pip install --force-reinstall {} \;
python -m pytest -s text-generation-inference/tests
python -m pytest -sv text-generation-inference/tests -k server

tgi_docker_test: neuronx-tgi
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
python -m pytest -s text-generation-inference/integration-tests
python -m pip install -r text-generation-inference/tests/requirements.txt
python -m pytest -sv text-generation-inference/tests -k integration
Binary file added docs/assets/guides/models/03-sd-lora.png
47 changes: 47 additions & 0 deletions docs/source/inference_tutorials/stable_diffusion.mdx
@@ -469,4 +469,51 @@ Inf2 instances contain one or more Neuron devices, and each Neuron device includ

</Tip>


## Load adapters

### LoRA

Low-Rank Adaptation (LoRA) is a fast way to adapt the style of images generated by Stable Diffusion. In Optimum Neuron, we support using one or multiple LoRA adapters by fusing their parameters into the original parameters of the text encoder(s) and the UNet during compilation. Below is an example of compiling Stable Diffusion models with the LoRA adapters of your choice and using the compiled artifacts to generate styled images:

```python

from diffusers import LCMScheduler
from optimum.neuron import NeuronStableDiffusionPipeline


model_id = "Lykon/dreamshaper-7"
adapter_id = "latent-consistency/lcm-lora-sdv1-5"
input_shapes = {"batch_size": 1, "height": 512, "width": 512, "num_images_per_prompt": 1}
compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"}

# Compile
pipe = NeuronStableDiffusionPipeline.from_pretrained(
model_id,
export=True,
inline_weights_to_neff=True, # caveat: performance drop if neff/weights separated, will be improved by a future Neuron sdk release.
lora_model_ids=adapter_id,
lora_weight_names="pytorch_lora_weights.safetensors",
lora_adapter_names="lcm",
**input_shapes,
**compiler_args,
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# Save locally or upload to the HuggingFace Hub
pipe.save_pretrained("dreamshaper_7_lcm_lora_neuron/")


# Inference
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
image = pipe(prompt, num_inference_steps=4, guidance_scale=0).images[0]
```

<img
src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/guides/models/03-sd-lora.png"
width="256"
height="256"
alt="stable diffusion generated image with LoRA adapter."
/>
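
You can also fuse several adapters at once by passing lists of model ids, weight file names, and adapter names. The snippet below is a minimal sketch rather than a tested recipe: the second adapter id and the adapter names are placeholders, and the `lora_scales` argument is assumed to mirror the per-adapter scales used by the export helpers.

```python
from optimum.neuron import NeuronStableDiffusionPipeline

model_id = "Lykon/dreamshaper-7"
input_shapes = {"batch_size": 1, "height": 512, "width": 512, "num_images_per_prompt": 1}
compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"}

pipe = NeuronStableDiffusionPipeline.from_pretrained(
    model_id,
    export=True,
    inline_weights_to_neff=True,
    # Both adapters are fused into the traced weights at compilation time.
    lora_model_ids=["latent-consistency/lcm-lora-sdv1-5", "your-username/your-style-lora"],  # second id is a placeholder
    lora_weight_names=["pytorch_lora_weights.safetensors", "pytorch_lora_weights.safetensors"],
    lora_adapter_names=["lcm", "style"],
    lora_scales=[1.0, 0.8],  # assumed per-adapter scales
    **input_shapes,
    **compiler_args,
)
pipe.save_pretrained("dreamshaper_7_multi_lora_neuron/")
```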

Are there any other Stable Diffusion features that you want us to support in 🤗 `Optimum-neuron`? Please file an issue in the [`Optimum-neuron` GitHub repo](https://github.com/huggingface/optimum-neuron) or discuss with us on [Hugging Face's community forum](https://discuss.huggingface.co/c/optimum/), cheers 🤗!
2 changes: 1 addition & 1 deletion docs/source/quickstart.mdx
@@ -71,7 +71,7 @@ torchrun --nproc_per_node=2 huggingface-neuron-samples/text-classification/run_g
You can compile and export your 🤗 Transformers models to a serialized format before inference on Neuron devices:

```bash
optimum-cli export neuron
optimum-cli export neuron \
--model distilbert-base-uncased-finetuned-sst-2-english \
--batch_size 1 \
--sequence_length 32 \
2 changes: 1 addition & 1 deletion infrastructure/ami/hcl2-files/variables.pkr.hcl
@@ -10,7 +10,7 @@ variable "instance_type" {
}

variable "source_ami" {
default = "ami-0da38db779978a5f7"
default = "ami-0274e546d67626305"
description = "Base Image"
type = string
/*
9 changes: 5 additions & 4 deletions optimum/exporters/neuron/__main__.py
@@ -264,17 +264,16 @@ def get_submodels_and_neuron_configs(

if is_stable_diffusion:
# TODO: Enable optional outputs for Stable Diffusion
if output_attentions or output_hidden_states:
raise ValueError(
f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet."
)
if output_attentions:
raise ValueError(f"`output_attentions` is not supported by the {task} task yet.")
models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion(
model=model,
input_shapes=input_shapes,
task=task,
output=output,
dynamic_batch_size=dynamic_batch_size,
submodels=submodels,
output_hidden_states=output_hidden_states,
lora_model_ids=lora_model_ids,
lora_weight_names=lora_weight_names,
lora_adapter_names=lora_adapter_names,
@@ -334,6 +333,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
output: Path,
dynamic_batch_size: bool = False,
submodels: Optional[Dict[str, Union[Path, str]]] = None,
output_hidden_states: bool = False,
lora_model_ids: Optional[Union[str, List[str]]] = None,
lora_weight_names: Optional[Union[str, List[str]]] = None,
lora_adapter_names: Optional[Union[str, List[str]]] = None,
@@ -368,6 +368,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
vae_encoder_input_shapes=input_shapes["vae_encoder"],
vae_decoder_input_shapes=input_shapes["vae_decoder"],
dynamic_batch_size=dynamic_batch_size,
output_hidden_states=output_hidden_states,
lora_model_ids=lora_model_ids,
lora_weight_names=lora_weight_names,
lora_adapter_names=lora_adapter_names,
8 changes: 6 additions & 2 deletions optimum/exporters/neuron/base.py
@@ -336,6 +336,7 @@ def patch_model_for_export(
Checks whether the input order of the model's forward pass corresponds to the generated dummy inputs, to ensure that the
dummy input tuple used for tracing is in the correct order.
"""
output_hidden_states = self.output_hidden_states

class ModelWrapper(torch.nn.Module):
def __init__(self, model: "PreTrainedModel", input_names: List[str]):
@@ -355,10 +356,13 @@ def forward(self, *input):
if forward_with_tuple is True:
outputs = self.model(*ordered_inputs.values())
else:
if output_hidden_states:
ordered_inputs["output_hidden_states"] = True
outputs = self.model(**ordered_inputs)

if isinstance(outputs, dict) and eligible_outputs is not None:
outputs = {name: outputs[name] for name in outputs.keys() & eligible_outputs}
if isinstance(outputs, dict):
if eligible_outputs is not None:
outputs = {name: outputs[name] for name in outputs.keys() & eligible_outputs}

if isinstance(outputs, tuple) and eligible_outputs is not None:
if not all(isinstance(x, int) for x in eligible_outputs):
2 changes: 1 addition & 1 deletion optimum/exporters/neuron/model_configs.py
@@ -244,7 +244,7 @@ def outputs(self) -> List[str]:

@register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="diffusers")
class CLIPTextWithProjectionNeuronConfig(TextEncoderNeuronConfig):
MODEL_TYPE = "clip-text-model"
MODEL_TYPE = "clip-text-with-projection"
ATOL_FOR_VALIDATION = 1e-3

NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
14 changes: 11 additions & 3 deletions optimum/exporters/neuron/utils.py
@@ -121,6 +121,7 @@ def get_stable_diffusion_models_for_export(
vae_encoder_input_shapes: Dict[str, int],
vae_decoder_input_shapes: Dict[str, int],
dynamic_batch_size: Optional[bool] = False,
output_hidden_states: bool = False,
lora_model_ids: Optional[List[str]] = None,
lora_weight_names: Optional[List[str]] = None,
lora_adapter_names: Optional[List[str]] = None,
@@ -147,6 +148,8 @@
Static shapes used for compiling vae decoder.
dynamic_batch_size (`bool`, defaults to `False`):
Whether the Neuron compiled model supports dynamic batch size.
output_hidden_states (`bool`, defaults to `False`):
Whether or not the traced text encoders should return the hidden states of all layers.
lora_model_ids (`Optional[List[str]]`, defaults to `None`):
List of model ids (eg. `ostris/super-cereal-sdxl-lora`) of pretrained lora models hosted on the Hub or paths to local directories containing the lora weights.
lora_weight_names (`Optional[List[str]]`, defaults to `None`):
@@ -183,6 +186,7 @@ def get_stable_diffusion_models_for_export(
text_encoder.config,
task="feature-extraction",
dynamic_batch_size=dynamic_batch_size,
output_hidden_states=output_hidden_states,
**text_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = (text_encoder, text_encoder_neuron_config)
@@ -200,6 +204,7 @@
text_encoder_2.config,
task="feature-extraction",
dynamic_batch_size=dynamic_batch_size,
output_hidden_states=output_hidden_states,
**text_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = (text_encoder_2, text_encoder_neuron_config_2)
@@ -287,7 +292,7 @@ def _load_lora_weights_to_pipeline(
if len(lora_model_ids) == 1:
pipeline.load_lora_weights(lora_model_ids[0], weight_name=weight_names[0])
# For tracing the lora weights, we need to use PEFT to fuse the adapters directly into the model weights; passing the lora scale to the Neuron pipeline at inference time won't work.
pipeline.fuse_lora(lora_scale=lora_scales[0])
pipeline.fuse_lora(lora_scale=lora_scales[0] if lora_scales else 1.0)
elif len(lora_model_ids) > 1:
if not len(lora_model_ids) == len(weight_names) == len(adapter_names):
raise ValueError(
@@ -300,10 +305,13 @@
pipeline.set_adapters(adapter_names, adapter_weights=lora_scales)
pipeline.fuse_lora()

return pipeline


def get_submodels_for_export_stable_diffusion(
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
task: str,
output_hidden_states: bool = False,
lora_model_ids: Optional[Union[str, List[str]]] = None,
lora_weight_names: Optional[Union[str, List[str]]] = None,
lora_adapter_names: Optional[Union[str, List[str]]] = None,
@@ -314,7 +322,7 @@ def get_submodels_for_export_stable_diffusion(
"""
is_sdxl = "xl" in task

_load_lora_weights_to_pipeline(
pipeline = _load_lora_weights_to_pipeline(
pipeline=pipeline,
lora_model_ids=lora_model_ids,
weight_names=lora_weight_names,
@@ -330,7 +338,7 @@

# Text encoders
if pipeline.text_encoder is not None:
if is_sdxl:
if is_sdxl or output_hidden_states:
pipeline.text_encoder.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_NAME, copy.deepcopy(pipeline.text_encoder)))

19 changes: 10 additions & 9 deletions optimum/neuron/generation/token_selector.py
@@ -1,6 +1,6 @@
import copy
import logging
from typing import Optional
from typing import List, Optional

import torch
from transformers.generation import (
@@ -41,15 +41,15 @@ def __init__(
mode: GenerationMode,
logits_processor: LogitsProcessorList,
stopping_criteria: StoppingCriteriaList,
eos_token_id: int,
eos_token_ids: List[int],
pad_token_id: int,
logits_warper: Optional[LogitsProcessorList] = None,
seed: Optional[int] = 0,
):
self.mode = mode
self.logits_processor = logits_processor
self.stopping_criteria = stopping_criteria
self.eos_token_id = eos_token_id
self.eos_token_ids = eos_token_ids
self.pad_token_id = pad_token_id
self.logits_warper = logits_warper
self.generator = torch.Generator()
@@ -130,13 +130,14 @@ def create(
stopping_criteria = StoppingCriteriaList()
stopping_criteria = model._get_stopping_criteria(generation_config, stopping_criteria=stopping_criteria)

# The generation requires special tokens
eos_token_id = generation_config.eos_token_id
# This is not supposed to happen for any of the models we support
assert eos_token_id is not None and not isinstance(eos_token_id, list)
eos_token_id = generation_config.eos_token_id
assert eos_token_id is not None
# The generation requires special tokens
eos_token_ids = eos_token_id if isinstance(eos_token_id, list) else [eos_token_id]
if generation_config.pad_token_id is None:
logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
generation_config.pad_token_id = eos_token_id
logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_ids[0]} for open-ended generation.")
generation_config.pad_token_id = eos_token_ids[0]

generation_mode = model._get_generation_mode(generation_config, None)
if generation_mode not in [GenerationMode.GREEDY_SEARCH, GenerationMode.SAMPLE]:
@@ -151,7 +152,7 @@ def create(
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
logits_warper=logits_warper,
eos_token_id=eos_token_id,
eos_token_ids=eos_token_ids,
pad_token_id=generation_config.pad_token_id,
seed=seed,
)
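
For readers following the `eos_token_id` → `eos_token_ids` change above: `generation_config.eos_token_id` can be a single id or a list of ids, and the selector now keeps the whole list. A standalone sketch of that normalization, illustrative only and not code from the commit:

```python
from typing import List, Union

def normalize_eos(eos_token_id: Union[int, List[int]]) -> List[int]:
    # Mirror the logic in TokenSelector.create: keep every EOS id, wrapping a single int in a list.
    assert eos_token_id is not None
    return eos_token_id if isinstance(eos_token_id, list) else [eos_token_id]

assert normalize_eos(2) == [2]
assert normalize_eos([128001, 128009]) == [128001, 128009]
```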
12 changes: 10 additions & 2 deletions optimum/neuron/modeling.py
@@ -842,7 +842,13 @@ def generate(
elif batch_size < self.batch_size and not self.continuous_batching:
logger.warning("Inputs will be padded to match the model static batch size. This will increase latency.")
padding_shape = [self.batch_size - batch_size, sequence_length]
padding = torch.full(padding_shape, fill_value=self.config.eos_token_id, dtype=torch.int64)
pad_token_id = generation_config.pad_token_id
if pad_token_id is None:
if isinstance(self.config.eos_token_id, list):
pad_token_id = self.config.eos_token_id[0]
else:
pad_token_id = self.config.eos_token_id
padding = torch.full(padding_shape, fill_value=pad_token_id, dtype=torch.int64)
padded_input_ids = torch.cat([padded_input_ids, padding])
padding = torch.zeros(padding_shape, dtype=torch.int64)
padded_attention_mask = torch.cat([padded_attention_mask, padding])
@@ -908,7 +914,9 @@ def generate_tokens(
attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)

# if eos_token was found in one sentence, set sentence to finished
unfinished_sequences = unfinished_sequences * next_tokens.ne(selector.eos_token_id)
unfinished_sequences = unfinished_sequences * torch.isin(
next_tokens, torch.tensor(selector.eos_token_ids), invert=True
)

# stop when each sentence is finished
if unfinished_sequences.max() == 0:
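
The `generate_tokens` hunk above swaps `next_tokens.ne(...)` for `torch.isin(..., invert=True)` so that producing any of the configured EOS ids marks a sequence as finished. A standalone illustration with made-up token ids, not code from the commit:

```python
import torch

next_tokens = torch.tensor([11, 42, 2, 7])
eos_token_ids = [2, 11]  # illustrative ids only

# Old check: only the first EOS id terminates a sequence.
keep_single = next_tokens.ne(eos_token_ids[0])                                  # tensor([ True,  True, False,  True])
# New check: any EOS id terminates a sequence.
keep_multi = torch.isin(next_tokens, torch.tensor(eos_token_ids), invert=True)  # tensor([False,  True, False,  True])

unfinished_sequences = torch.ones_like(next_tokens) * keep_multi
```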
17 changes: 12 additions & 5 deletions optimum/neuron/modeling_decoder.py
@@ -52,7 +52,9 @@


def get_exporter(config, task):
return TasksManager.get_exporter_config_constructor(model_type=config.model_type, exporter="neuron", task=task)()
return TasksManager.get_exporter_config_constructor(
model_type=config.model_type, exporter="neuron", task=task, library_name="transformers"
)()


# Note: with python 3.9, functools.cache would be more suited
@@ -281,8 +283,13 @@ def get_export_config(
batch_size = 1
# If the sequence_length was not specified, deduce it from the model configuration
if sequence_length is None:
# Note: for older models, max_position_embeddings is an alias for n_positions
sequence_length = config.max_position_embeddings
if hasattr(config, "n_positions"):
sequence_length = config.n_positions
elif hasattr(config, "max_position_embeddings"):
sequence_length = config.max_position_embeddings
else:
# Use transformers-neuronx default
sequence_length = 2048
if num_cores is None:
# Use all available cores
num_cores = get_available_cores()
@@ -355,7 +362,7 @@ def _export(
# Try to reload the generation config (if any)
generation_config = None
try:
generation_config = GenerationConfig.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id, revision=revision)
except OSError:
pass

@@ -412,7 +419,7 @@ def _from_pretrained(
# Try to reload the generation config (if any)
generation_config = None
try:
generation_config = GenerationConfig.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id, revision=revision)
except OSError:
pass
