
Commit

Merge branch 'main' into update_docs
michaelbenayoun committed May 3, 2024
2 parents ece8829 + 18460aa commit 2863da9
Showing 40 changed files with 455 additions and 258 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/test_inf2_tgi.yml
@@ -44,6 +44,10 @@ jobs:
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install integration tests prerequisites
run: |
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -r text-generation-inference/tests/requirements.txt
- name: Run TGI server python tests
run: |
# gawk is required when invoking the Makefile targets
@@ -55,12 +59,8 @@ jobs:
run: |
source aws_neuron_venv_pytorch/bin/activate
make neuronx-tgi
- name: Install integration tests prerequisites
run: |
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
- name: Run TGI docker tests
shell: bash
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} python -m pytest -sv text-generation-inference/integration-tests
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} python -m pytest -sv text-generation-inference/tests -k integration
11 changes: 6 additions & 5 deletions Makefile
@@ -40,7 +40,7 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
$(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
python -m build

TGI_VERSION ?= 1.4.4
TGI_VERSION ?= 2.0.1

neuronx-tgi: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile \
@@ -93,11 +93,12 @@ tgi_server:
VERSION=${VERSION} TGI_VERSION=${TGI_VERSION} make -C text-generation-inference/server gen-server

tgi_test: tgi_server
python -m pip install .[neuronx] pytest
python -m pip install .[neuronx]
python -m pip install -r text-generation-inference/tests/requirements.txt
find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
-exec python -m pip install --force-reinstall {} \;
python -m pytest -s text-generation-inference/tests
python -m pytest -sv text-generation-inference/tests -k server

tgi_docker_test: neuronx-tgi
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
python -m pytest -s text-generation-inference/integration-tests
python -m pip install -r text-generation-inference/tests/requirements.txt
python -m pytest -sv text-generation-inference/tests -k integration
Binary file added docs/assets/guides/models/03-sd-lora.png
47 changes: 47 additions & 0 deletions docs/source/inference_tutorials/stable_diffusion.mdx
@@ -469,4 +469,51 @@ Inf2 instances contain one or more Neuron devices, and each Neuron device includ

</Tip>


## Load adapters

### LoRA

Low-Rank Adaptation (LoRA) is a fast way to adapt the style of images generated by Stable Diffusion. In Optimum Neuron, we support using one or multiple LoRA adapters by fusing their parameters into the original parameters of the text encoder(s) and the UNet during compilation. Below is an example of compiling Stable Diffusion models with the LoRA adapters of your choice and using the compiled artifacts to generate styled images:

```python

from diffusers import LCMScheduler
from optimum.neuron import NeuronStableDiffusionPipeline


model_id = "Lykon/dreamshaper-7"
adapter_id = "latent-consistency/lcm-lora-sdv1-5"
input_shapes = {"batch_size": 1, "height": 512, "width": 512, "num_images_per_prompt": 1}
compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"}

# Compile
pipe = NeuronStableDiffusionPipeline.from_pretrained(
model_id,
export=True,
inline_weights_to_neff=True, # caveat: performance drop if neff/weights separated, will be improved by a future Neuron sdk release.
lora_model_ids=adapter_id,
lora_weight_names="pytorch_lora_weights.safetensors",
lora_adapter_names="lcm",
**input_shapes,
**compiler_args,
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# Save locally or upload to the HuggingFace Hub
pipe.save_pretrained("dreamshaper_7_lcm_lora_neuron/")


# Inference
prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
image = pipe(prompt, num_inference_steps=4, guidance_scale=0).images[0]
```

<img
src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/guides/models/03-sd-lora.png"
width="256"
height="256"
alt="stable diffusion generated image with LoRA adapter."
/>
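
You can also fuse several adapters at once by passing lists of model ids, weight file names, and adapter names. The snippet below is a minimal sketch rather than a tested recipe: the second adapter id and the adapter names are placeholders, and the `lora_scales` argument is assumed to mirror the per-adapter scales used by the export helpers.

```python
from optimum.neuron import NeuronStableDiffusionPipeline

model_id = "Lykon/dreamshaper-7"
input_shapes = {"batch_size": 1, "height": 512, "width": 512, "num_images_per_prompt": 1}
compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"}

pipe = NeuronStableDiffusionPipeline.from_pretrained(
    model_id,
    export=True,
    inline_weights_to_neff=True,
    # Both adapters are fused into the traced weights at compilation time.
    lora_model_ids=["latent-consistency/lcm-lora-sdv1-5", "your-username/your-style-lora"],  # second id is a placeholder
    lora_weight_names=["pytorch_lora_weights.safetensors", "pytorch_lora_weights.safetensors"],
    lora_adapter_names=["lcm", "style"],
    lora_scales=[1.0, 0.8],  # assumed per-adapter scales
    **input_shapes,
    **compiler_args,
)
pipe.save_pretrained("dreamshaper_7_multi_lora_neuron/")
```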

Are there any other Stable Diffusion features that you want us to support in 🤗 `Optimum-neuron`? Please file an issue in the [`Optimum-neuron` GitHub repo](https://github.com/huggingface/optimum-neuron) or discuss with us on [Hugging Face's community forum](https://discuss.huggingface.co/c/optimum/), cheers 🤗!
2 changes: 1 addition & 1 deletion docs/source/quickstart.mdx
@@ -71,7 +71,7 @@ torchrun --nproc_per_node=2 huggingface-neuron-samples/text-classification/run_g
You can compile and export your 🤗 Transformers models to a serialized format before inference on Neuron devices:

```bash
optimum-cli export neuron
optimum-cli export neuron \
--model distilbert-base-uncased-finetuned-sst-2-english \
--batch_size 1 \
--sequence_length 32 \
2 changes: 1 addition & 1 deletion infrastructure/ami/hcl2-files/variables.pkr.hcl
@@ -10,7 +10,7 @@ variable "instance_type" {
}

variable "source_ami" {
default = "ami-0da38db779978a5f7"
default = "ami-0274e546d67626305"
description = "Base Image"
type = string
/*
9 changes: 5 additions & 4 deletions optimum/exporters/neuron/__main__.py
@@ -264,17 +264,16 @@ def get_submodels_and_neuron_configs(

if is_stable_diffusion:
# TODO: Enable optional outputs for Stable Diffusion
if output_attentions or output_hidden_states:
raise ValueError(
f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet."
)
if output_attentions:
raise ValueError(f"`output_attentions` is not supported by the {task} task yet.")
models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion(
model=model,
input_shapes=input_shapes,
task=task,
output=output,
dynamic_batch_size=dynamic_batch_size,
submodels=submodels,
output_hidden_states=output_hidden_states,
lora_model_ids=lora_model_ids,
lora_weight_names=lora_weight_names,
lora_adapter_names=lora_adapter_names,
@@ -334,6 +333,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
output: Path,
dynamic_batch_size: bool = False,
submodels: Optional[Dict[str, Union[Path, str]]] = None,
output_hidden_states: bool = False,
lora_model_ids: Optional[Union[str, List[str]]] = None,
lora_weight_names: Optional[Union[str, List[str]]] = None,
lora_adapter_names: Optional[Union[str, List[str]]] = None,
@@ -368,6 +368,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
vae_encoder_input_shapes=input_shapes["vae_encoder"],
vae_decoder_input_shapes=input_shapes["vae_decoder"],
dynamic_batch_size=dynamic_batch_size,
output_hidden_states=output_hidden_states,
lora_model_ids=lora_model_ids,
lora_weight_names=lora_weight_names,
lora_adapter_names=lora_adapter_names,
8 changes: 6 additions & 2 deletions optimum/exporters/neuron/base.py
@@ -336,6 +336,7 @@ def patch_model_for_export(
Checks whether the input order of the model's forward pass corresponds to the generated dummy inputs, to ensure that the
dummy input tuple used for tracing is in the correct order.
"""
output_hidden_states = self.output_hidden_states

class ModelWrapper(torch.nn.Module):
def __init__(self, model: "PreTrainedModel", input_names: List[str]):
@@ -355,10 +356,13 @@ def forward(self, *input):
if forward_with_tuple is True:
outputs = self.model(*ordered_inputs.values())
else:
if output_hidden_states:
ordered_inputs["output_hidden_states"] = True
outputs = self.model(**ordered_inputs)

if isinstance(outputs, dict) and eligible_outputs is not None:
outputs = {name: outputs[name] for name in outputs.keys() & eligible_outputs}
if isinstance(outputs, dict):
if eligible_outputs is not None:
outputs = {name: outputs[name] for name in outputs.keys() & eligible_outputs}

if isinstance(outputs, tuple) and eligible_outputs is not None:
if not all(isinstance(x, int) for x in eligible_outputs):
2 changes: 1 addition & 1 deletion optimum/exporters/neuron/model_configs.py
@@ -244,7 +244,7 @@ def outputs(self) -> List[str]:

@register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="diffusers")
class CLIPTextWithProjectionNeuronConfig(TextEncoderNeuronConfig):
MODEL_TYPE = "clip-text-model"
MODEL_TYPE = "clip-text-with-projection"
ATOL_FOR_VALIDATION = 1e-3

NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
14 changes: 11 additions & 3 deletions optimum/exporters/neuron/utils.py
@@ -121,6 +121,7 @@ def get_stable_diffusion_models_for_export(
vae_encoder_input_shapes: Dict[str, int],
vae_decoder_input_shapes: Dict[str, int],
dynamic_batch_size: Optional[bool] = False,
output_hidden_states: bool = False,
lora_model_ids: Optional[List[str]] = None,
lora_weight_names: Optional[List[str]] = None,
lora_adapter_names: Optional[List[str]] = None,
@@ -147,6 +148,8 @@
Static shapes used for compiling vae decoder.
dynamic_batch_size (`bool`, defaults to `False`):
Whether the Neuron compiled model supports dynamic batch size.
output_hidden_states (`bool`, defaults to `False`):
Whether or not the traced text encoders should return the hidden states of all layers.
lora_model_ids (`Optional[List[str]]`, defaults to `None`):
List of model ids (eg. `ostris/super-cereal-sdxl-lora`) of pretrained lora models hosted on the Hub or paths to local directories containing the lora weights.
lora_weight_names (`Optional[List[str]]`, defaults to `None`):
@@ -183,6 +186,7 @@ def get_stable_diffusion_models_for_export(
text_encoder.config,
task="feature-extraction",
dynamic_batch_size=dynamic_batch_size,
output_hidden_states=output_hidden_states,
**text_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = (text_encoder, text_encoder_neuron_config)
@@ -200,6 +204,7 @@
text_encoder_2.config,
task="feature-extraction",
dynamic_batch_size=dynamic_batch_size,
output_hidden_states=output_hidden_states,
**text_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = (text_encoder_2, text_encoder_neuron_config_2)
@@ -287,7 +292,7 @@ def _load_lora_weights_to_pipeline(
if len(lora_model_ids) == 1:
pipeline.load_lora_weights(lora_model_ids[0], weight_name=weight_names[0])
# For tracing the lora weights, we need to use PEFT to fuse the adapters directly into the model weights; passing the lora scale to the Neuron pipeline at inference time won't work.
pipeline.fuse_lora(lora_scale=lora_scales[0])
pipeline.fuse_lora(lora_scale=lora_scales[0] if lora_scales else 1.0)
elif len(lora_model_ids) > 1:
if not len(lora_model_ids) == len(weight_names) == len(adapter_names):
raise ValueError(
@@ -300,10 +305,13 @@
pipeline.set_adapters(adapter_names, adapter_weights=lora_scales)
pipeline.fuse_lora()

return pipeline


def get_submodels_for_export_stable_diffusion(
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
task: str,
output_hidden_states: bool = False,
lora_model_ids: Optional[Union[str, List[str]]] = None,
lora_weight_names: Optional[Union[str, List[str]]] = None,
lora_adapter_names: Optional[Union[str, List[str]]] = None,
@@ -314,7 +322,7 @@ def get_submodels_for_export_stable_diffusion(
"""
is_sdxl = "xl" in task

_load_lora_weights_to_pipeline(
pipeline = _load_lora_weights_to_pipeline(
pipeline=pipeline,
lora_model_ids=lora_model_ids,
weight_names=lora_weight_names,
@@ -330,7 +338,7 @@

# Text encoders
if pipeline.text_encoder is not None:
if is_sdxl:
if is_sdxl or output_hidden_states:
pipeline.text_encoder.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_NAME, copy.deepcopy(pipeline.text_encoder)))

19 changes: 10 additions & 9 deletions optimum/neuron/generation/token_selector.py
@@ -1,6 +1,6 @@
import copy
import logging
from typing import Optional
from typing import List, Optional

import torch
from transformers.generation import (
@@ -41,15 +41,15 @@ def __init__(
mode: GenerationMode,
logits_processor: LogitsProcessorList,
stopping_criteria: StoppingCriteriaList,
eos_token_id: int,
eos_token_ids: List[int],
pad_token_id: int,
logits_warper: Optional[LogitsProcessorList] = None,
seed: Optional[int] = 0,
):
self.mode = mode
self.logits_processor = logits_processor
self.stopping_criteria = stopping_criteria
self.eos_token_id = eos_token_id
self.eos_token_ids = eos_token_ids
self.pad_token_id = pad_token_id
self.logits_warper = logits_warper
self.generator = torch.Generator()
@@ -130,13 +130,14 @@ def create(
stopping_criteria = StoppingCriteriaList()
stopping_criteria = model._get_stopping_criteria(generation_config, stopping_criteria=stopping_criteria)

# The generation requires special tokens
eos_token_id = generation_config.eos_token_id
# This is not supposed to happen for any of the models we support
assert eos_token_id is not None and not isinstance(eos_token_id, list)
eos_token_id = generation_config.eos_token_id
assert eos_token_id is not None
# The generation requires special tokens
eos_token_ids = eos_token_id if isinstance(eos_token_id, list) else [eos_token_id]
if generation_config.pad_token_id is None:
logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
generation_config.pad_token_id = eos_token_id
logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_ids[0]} for open-ended generation.")
generation_config.pad_token_id = eos_token_ids[0]

generation_mode = model._get_generation_mode(generation_config, None)
if generation_mode not in [GenerationMode.GREEDY_SEARCH, GenerationMode.SAMPLE]:
@@ -151,7 +152,7 @@ def create(
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
logits_warper=logits_warper,
eos_token_id=eos_token_id,
eos_token_ids=eos_token_ids,
pad_token_id=generation_config.pad_token_id,
seed=seed,
)
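
For readers following the `eos_token_id` → `eos_token_ids` change above: `generation_config.eos_token_id` can be a single id or a list of ids, and the selector now keeps the whole list. A standalone sketch of that normalization, illustrative only and not code from the commit:

```python
from typing import List, Union

def normalize_eos(eos_token_id: Union[int, List[int]]) -> List[int]:
    # Mirror the logic in TokenSelector.create: keep every EOS id, wrapping a single int in a list.
    assert eos_token_id is not None
    return eos_token_id if isinstance(eos_token_id, list) else [eos_token_id]

assert normalize_eos(2) == [2]
assert normalize_eos([128001, 128009]) == [128001, 128009]
```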
12 changes: 10 additions & 2 deletions optimum/neuron/modeling.py
@@ -842,7 +842,13 @@ def generate(
elif batch_size < self.batch_size and not self.continuous_batching:
logger.warning("Inputs will be padded to match the model static batch size. This will increase latency.")
padding_shape = [self.batch_size - batch_size, sequence_length]
padding = torch.full(padding_shape, fill_value=self.config.eos_token_id, dtype=torch.int64)
pad_token_id = generation_config.pad_token_id
if pad_token_id is None:
if isinstance(self.config.eos_token_id, list):
pad_token_id = self.config.eos_token_id[0]
else:
pad_token_id = self.config.eos_token_id
padding = torch.full(padding_shape, fill_value=pad_token_id, dtype=torch.int64)
padded_input_ids = torch.cat([padded_input_ids, padding])
padding = torch.zeros(padding_shape, dtype=torch.int64)
padded_attention_mask = torch.cat([padded_attention_mask, padding])
@@ -908,7 +914,9 @@ def generate_tokens(
attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)

# if eos_token was found in one sentence, set sentence to finished
unfinished_sequences = unfinished_sequences * next_tokens.ne(selector.eos_token_id)
unfinished_sequences = unfinished_sequences * torch.isin(
next_tokens, torch.tensor(selector.eos_token_ids), invert=True
)

# stop when each sentence is finished
if unfinished_sequences.max() == 0:
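
The `generate_tokens` hunk above swaps `next_tokens.ne(...)` for `torch.isin(..., invert=True)` so that producing any of the configured EOS ids marks a sequence as finished. A standalone illustration with made-up token ids, not code from the commit:

```python
import torch

next_tokens = torch.tensor([11, 42, 2, 7])
eos_token_ids = [2, 11]  # illustrative ids only

# Old check: only the first EOS id terminates a sequence.
keep_single = next_tokens.ne(eos_token_ids[0])                                  # tensor([ True,  True, False,  True])
# New check: any EOS id terminates a sequence.
keep_multi = torch.isin(next_tokens, torch.tensor(eos_token_ids), invert=True)  # tensor([False,  True, False,  True])

unfinished_sequences = torch.ones_like(next_tokens) * keep_multi
```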
17 changes: 12 additions & 5 deletions optimum/neuron/modeling_decoder.py
@@ -52,7 +52,9 @@


def get_exporter(config, task):
return TasksManager.get_exporter_config_constructor(model_type=config.model_type, exporter="neuron", task=task)()
return TasksManager.get_exporter_config_constructor(
model_type=config.model_type, exporter="neuron", task=task, library_name="transformers"
)()


# Note: with python 3.9, functools.cache would be more suited
@@ -281,8 +283,13 @@ def get_export_config(
batch_size = 1
# If the sequence_length was not specified, deduce it from the model configuration
if sequence_length is None:
# Note: for older models, max_position_embeddings is an alias for n_positions
sequence_length = config.max_position_embeddings
if hasattr(config, "n_positions"):
sequence_length = config.n_positions
elif hasattr(config, "max_position_embeddings"):
sequence_length = config.max_position_embeddings
else:
# Use transformers-neuronx default
sequence_length = 2048
if num_cores is None:
# Use all available cores
num_cores = get_available_cores()
@@ -355,7 +362,7 @@ def _export(
# Try to reload the generation config (if any)
generation_config = None
try:
generation_config = GenerationConfig.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id, revision=revision)
except OSError:
pass

@@ -412,7 +419,7 @@ def _from_pretrained(
# Try to reload the generation config (if any)
generation_config = None
try:
generation_config = GenerationConfig.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id, revision=revision)
except OSError:
pass
