Optimize checkpointing (#505)
* fix(decoder): use model dtype when creating checkpoint

This prevents the checkpoint weights from being stored as float32 (see the dtype sketch below).

* fix(tgi): export model in one step

When the model needs to be exported, calling snapshot_download before the export is inefficient, as it fetches both the PyTorch and the safetensors weights (see the export sketch below).

* fix(tools): styling
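
For context, a minimal sketch (not part of this commit) of the dtype behaviour the decoder fix relies on; the model id is only an example:

```python
from transformers import AutoModelForCausalLM

# Default behaviour: the weights are materialized in the framework default
# dtype (float32), even when the checkpoint was saved in float16/bfloat16.
model = AutoModelForCausalLM.from_pretrained("gpt2")
print(model.dtype)  # torch.float32

# With torch_dtype="auto", the dtype recorded in the checkpoint config is
# kept, so the intermediate checkpoint is not inflated to float32.
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="auto")
print(model.dtype)  # the dtype the checkpoint was saved with
```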
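Likewise, a sketch of the export change, with an assumed model id and assumed export arguments: exporting straight from the hub id replaces the snapshot_download-then-export sequence, so only the weight files the export actually uses are fetched.

```python
from huggingface_hub import snapshot_download
from optimum.neuron import NeuronModelForCausalLM

model_id = "gpt2"  # example only
# The export arguments below are assumptions for the sketch, not values from this commit.
export_kwargs = {"batch_size": 1, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}

# Before: the snapshot pulls every weight file in the repository
# (both *.bin and *.safetensors) before the export even starts.
model_path = snapshot_download(model_id)
model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)

# After: exporting directly from the hub id lets the loader download only
# the weight files it actually needs.
model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
```
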
dacorvo authored Mar 6, 2024
1 parent 7649e6c commit 249d0b6
Showing 3 changed files with 12 additions and 13 deletions.
1 change: 1 addition & 0 deletions optimum/neuron/modeling_decoder.py
@@ -207,6 +207,7 @@ def _create_checkpoint(
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
torch_dtype="auto",
**kwargs,
)

13 changes: 5 additions & 8 deletions text-generation-inference/server/text_generation_server/model.py
@@ -98,17 +98,14 @@ def fetch_model(
# Export the model
logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
start = time.time()
logger.info(f"Fetching revision {revision} of model {model_id}.")
model_path = snapshot_download(model_id, revision=revision)
end = time.time()
logger.info(f"Model successfully fetched in {end - start:.2f} s.")
logger.info(f"Exporting model to neuron with config {neuron_config}.")
start = time.time()
model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)
# Save for later retrieval
model.save_pretrained(export_path)
model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
end = time.time()
# We also need to fetch and save the tokenizer
logger.info(f"Model successfully exported in {end - start:.2f} s.")
logger.info(f"Saving exported model to local storage under {export_path}.")
model.save_pretrained(export_path)
logger.info(f"Saving model tokenizer under {export_path}.")
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
tokenizer.save_pretrained(export_path)
logger.info(f"Model successfully exported in {end - start:.2f} s under {export_path}.")
11 changes: 6 additions & 5 deletions tools/auto_fill_inference_cache.py
@@ -13,18 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script to cache models for inference."""
import argparse
import json
import logging
import os
import re
import subprocess
import argparse
import tempfile
import time

import requests
from huggingface_hub import login

from optimum.neuron import version as optimum_neuron_version
import re
import requests
from requests.exceptions import HTTPError


# Example usage:
# huggingface-cli login --token hf_xxx # access to cache repo
