From 249d0b6fdae20c0605e988c23625055bd398b6f4 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Wed, 6 Mar 2024 10:28:57 +0100
Subject: [PATCH] Optimize checkpointing (#505)

* fix(decoder): use model dtype when creating checkpoint

This avoids checkpoint weights being stored as float32.

* fix(tgi): export model in one step

When the model needs to be exported, using snapshot_download before export
is not efficient, as it fetches both the pytorch and safetensors weights.

* fix(tools): styling
---
 optimum/neuron/modeling_decoder.py                 |  1 +
 .../server/text_generation_server/model.py         | 13 +++++--------
 tools/auto_fill_inference_cache.py                 | 11 ++++++-----
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/optimum/neuron/modeling_decoder.py b/optimum/neuron/modeling_decoder.py
index e09b0a18e..10b9090a1 100644
--- a/optimum/neuron/modeling_decoder.py
+++ b/optimum/neuron/modeling_decoder.py
@@ -207,6 +207,7 @@ def _create_checkpoint(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
+            torch_dtype="auto",
             **kwargs,
         )
 
diff --git a/text-generation-inference/server/text_generation_server/model.py b/text-generation-inference/server/text_generation_server/model.py
index 6a721d58c..6dadff43a 100644
--- a/text-generation-inference/server/text_generation_server/model.py
+++ b/text-generation-inference/server/text_generation_server/model.py
@@ -98,17 +98,14 @@ def fetch_model(
     # Export the model
     logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
     start = time.time()
-    logger.info(f"Fetching revision {revision} of model {model_id}.")
-    model_path = snapshot_download(model_id, revision=revision)
-    end = time.time()
-    logger.info(f"Model successfully fetched in {end - start:.2f} s.")
     logger.info(f"Exporting model to neuron with config {neuron_config}.")
     start = time.time()
-    model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)
-    # Save for later retrieval
-    model.save_pretrained(export_path)
+    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
     end = time.time()
-    # We also need to fetch and save the tokenizer
+    logger.info(f"Model successfully exported in {end - start:.2f} s.")
+    logger.info(f"Saving exported model to local storage under {export_path}.")
+    model.save_pretrained(export_path)
+    logger.info(f"Saving model tokenizer under {export_path}.")
     tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
     tokenizer.save_pretrained(export_path)
     logger.info(f"Model successfully exported in {end - start:.2f} s under {export_path}.")
diff --git a/tools/auto_fill_inference_cache.py b/tools/auto_fill_inference_cache.py
index 22b1a8185..8026e7e40 100644
--- a/tools/auto_fill_inference_cache.py
+++ b/tools/auto_fill_inference_cache.py
@@ -13,18 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Script to cache models for inference."""
+import argparse
 import json
 import logging
-import os
+import re
 import subprocess
-import argparse
 import tempfile
 import time
+
+import requests
 from huggingface_hub import login
+
 from optimum.neuron import version as optimum_neuron_version
-import re
-import requests
-from requests.exceptions import HTTPError
+
 
 # Example usage:
 # huggingface-cli login --token hf_xxx  # access to cache repo
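
For context on the decoder fix, the snippet below is a minimal sketch (not the patched _create_checkpoint helper itself) of what torch_dtype="auto" changes in a plain transformers load; the model id "gpt2" is only a placeholder. Without the flag, from_pretrained instantiates the weights in the default float32, so re-saving a half-precision checkpoint up-casts it; with "auto", the dtype recorded in the checkpoint is kept.

    # Minimal sketch, assuming a plain transformers model; "gpt2" is a placeholder id.
    from transformers import AutoModelForCausalLM

    # Default behaviour: weights are instantiated as torch.float32 regardless of
    # the dtype they were serialized in.
    model_default = AutoModelForCausalLM.from_pretrained("gpt2")
    print(model_default.dtype)  # torch.float32

    # With torch_dtype="auto", the dtype declared by the checkpoint is preserved,
    # so saving it again does not inflate the weights to float32.
    model_auto = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="auto")
    print(model_auto.dtype)  # checkpoint dtype (e.g. float16/bfloat16 models stay half precision)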
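The TGI change reduces to the one-step flow sketched below. This is a hedged illustration, not the server code: the model id, export path and the compiler arguments in export_kwargs (batch_size, sequence_length, num_cores, auto_cast_type) are example values, not the ones fetch_model derives from its neuron config.

    # Sketch of the single-step export: the hub model id is passed straight to
    # from_pretrained(export=True), so there is no separate snapshot_download()
    # call that would pull both the pytorch and safetensors weights.
    from transformers import AutoTokenizer
    from optimum.neuron import NeuronModelForCausalLM

    model_id = "gpt2"                  # placeholder model id
    export_path = "/data/neuron-gpt2"  # placeholder local directory
    export_kwargs = {                  # illustrative neuron export arguments
        "batch_size": 1,
        "sequence_length": 128,
        "num_cores": 2,
        "auto_cast_type": "fp16",
    }

    # Fetch and compile in one step, then save the compiled artifacts and the
    # tokenizer side by side for later retrieval.
    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
    model.save_pretrained(export_path)
    AutoTokenizer.from_pretrained(model_id).save_pretrained(export_path)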