From 249d0b6fdae20c0605e988c23625055bd398b6f4 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Wed, 6 Mar 2024 10:28:57 +0100
Subject: [PATCH] Optimize checkpointing (#505)

* fix(decoder): use model dtype when creating checkpoint

This avoids checkpoint weights being stored as float32.

* fix(tgi): export model in one step

When the model needs to be exported, using snapshot_download before export
is not efficient, as it fetches both the pytorch and safetensors weights.

* fix(tools): styling
---
 optimum/neuron/modeling_decoder.py                 |  1 +
 .../server/text_generation_server/model.py         | 13 +++++--------
 tools/auto_fill_inference_cache.py                 | 11 ++++++-----
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/optimum/neuron/modeling_decoder.py b/optimum/neuron/modeling_decoder.py
index e09b0a18e..10b9090a1 100644
--- a/optimum/neuron/modeling_decoder.py
+++ b/optimum/neuron/modeling_decoder.py
@@ -207,6 +207,7 @@ def _create_checkpoint(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
+            torch_dtype="auto",
             **kwargs,
         )
 
diff --git a/text-generation-inference/server/text_generation_server/model.py b/text-generation-inference/server/text_generation_server/model.py
index 6a721d58c..6dadff43a 100644
--- a/text-generation-inference/server/text_generation_server/model.py
+++ b/text-generation-inference/server/text_generation_server/model.py
@@ -98,17 +98,14 @@ def fetch_model(
     # Export the model
     logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
     start = time.time()
-    logger.info(f"Fetching revision {revision} of model {model_id}.")
-    model_path = snapshot_download(model_id, revision=revision)
-    end = time.time()
-    logger.info(f"Model successfully fetched in {end - start:.2f} s.")
     logger.info(f"Exporting model to neuron with config {neuron_config}.")
     start = time.time()
-    model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)
-    # Save for later retrieval
-    model.save_pretrained(export_path)
+    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
     end = time.time()
-    # We also need to fetch and save the tokenizer
+    logger.info(f"Model successfully exported in {end - start:.2f} s.")
+    logger.info(f"Saving exported model to local storage under {export_path}.")
+    model.save_pretrained(export_path)
+    logger.info(f"Saving model tokenizer under {export_path}.")
     tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
     tokenizer.save_pretrained(export_path)
     logger.info(f"Model successfully exported in {end - start:.2f} s under {export_path}.")
diff --git a/tools/auto_fill_inference_cache.py b/tools/auto_fill_inference_cache.py
index 22b1a8185..8026e7e40 100644
--- a/tools/auto_fill_inference_cache.py
+++ b/tools/auto_fill_inference_cache.py
@@ -13,18 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Script to cache models for inference."""
+import argparse
 import json
 import logging
-import os
+import re
 import subprocess
-import argparse
 import tempfile
 import time
+
+import requests
 from huggingface_hub import login
+
 from optimum.neuron import version as optimum_neuron_version
-import re
-import requests
-from requests.exceptions import HTTPError
+
 
 # Example usage:
 # huggingface-cli login --token hf_xxx  # access to cache repo
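
For context on the decoder fix, the snippet below is a minimal sketch (not the patched _create_checkpoint helper itself) of what torch_dtype="auto" changes in a plain transformers load; the model id "gpt2" is only a placeholder. Without the flag, from_pretrained instantiates the weights in the default float32, so re-saving a half-precision checkpoint up-casts it; with "auto", the dtype recorded in the checkpoint is kept.

    # Minimal sketch, assuming a plain transformers model; "gpt2" is a placeholder id.
    from transformers import AutoModelForCausalLM

    # Default behaviour: weights are instantiated as torch.float32 regardless of
    # the dtype they were serialized in.
    model_default = AutoModelForCausalLM.from_pretrained("gpt2")
    print(model_default.dtype)  # torch.float32

    # With torch_dtype="auto", the dtype declared by the checkpoint is preserved,
    # so saving it again does not inflate the weights to float32.
    model_auto = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="auto")
    print(model_auto.dtype)  # checkpoint dtype (e.g. float16/bfloat16 models stay half precision)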
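The TGI change reduces to the one-step flow sketched below. This is a hedged illustration, not the server code: the model id, export path and the compiler arguments in export_kwargs (batch_size, sequence_length, num_cores, auto_cast_type) are example values, not the ones fetch_model derives from its neuron config.

    # Sketch of the single-step export: the hub model id is passed straight to
    # from_pretrained(export=True), so there is no separate snapshot_download()
    # call that would pull both the pytorch and safetensors weights.
    from transformers import AutoTokenizer
    from optimum.neuron import NeuronModelForCausalLM

    model_id = "gpt2"                  # placeholder model id
    export_path = "/data/neuron-gpt2"  # placeholder local directory
    export_kwargs = {                  # illustrative neuron export arguments
        "batch_size": 1,
        "sequence_length": 128,
        "num_cores": 2,
        "auto_cast_type": "fp16",
    }

    # Fetch and compile in one step, then save the compiled artifacts and the
    # tokenizer side by side for later retrieval.
    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
    model.save_pretrained(export_path)
    AutoTokenizer.from_pretrained(model_id).save_pretrained(export_path)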