Optimize checkpointing (#505)
* fix(decoder): use model dtype when creating checkpoint

This prevents the checkpoint weights from being stored as float32 (see the dtype sketch below).

* fix(tgi): export model in one step

When the model needs to be exported, calling snapshot_download before the export is inefficient, as it fetches both the PyTorch and the safetensors weights (see the export sketch below).

* fix(tools): styling
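
For context, a minimal sketch (not part of this commit) of the dtype behaviour the decoder fix relies on; the model id is only an example:

```python
from transformers import AutoModelForCausalLM

# Default behaviour: the weights are materialized in the framework default
# dtype (float32), even when the checkpoint was saved in float16/bfloat16.
model = AutoModelForCausalLM.from_pretrained("gpt2")
print(model.dtype)  # torch.float32

# With torch_dtype="auto", the dtype recorded in the checkpoint config is
# kept, so the intermediate checkpoint is not inflated to float32.
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="auto")
print(model.dtype)  # the dtype the checkpoint was saved with
```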
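Likewise, a sketch of the export change, with an assumed model id and assumed export arguments: exporting straight from the hub id replaces the snapshot_download-then-export sequence, so only the weight files the export actually uses are fetched.

```python
from huggingface_hub import snapshot_download
from optimum.neuron import NeuronModelForCausalLM

model_id = "gpt2"  # example only
# The export arguments below are assumptions for the sketch, not values from this commit.
export_kwargs = {"batch_size": 1, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}

# Before: the snapshot pulls every weight file in the repository
# (both *.bin and *.safetensors) before the export even starts.
model_path = snapshot_download(model_id)
model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)

# After: exporting directly from the hub id lets the loader download only
# the weight files it actually needs.
model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
```
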
dacorvo authored Mar 6, 2024
1 parent 7649e6c commit 249d0b6
Showing 3 changed files with 12 additions and 13 deletions.
1 change: 1 addition & 0 deletions optimum/neuron/modeling_decoder.py
@@ -207,6 +207,7 @@ def _create_checkpoint(
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
torch_dtype="auto",
**kwargs,
)

13 changes: 5 additions & 8 deletions text-generation-inference/server/text_generation_server/model.py
@@ -98,17 +98,14 @@ def fetch_model(
# Export the model
logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
start = time.time()
logger.info(f"Fetching revision {revision} of model {model_id}.")
model_path = snapshot_download(model_id, revision=revision)
end = time.time()
logger.info(f"Model successfully fetched in {end - start:.2f} s.")
logger.info(f"Exporting model to neuron with config {neuron_config}.")
start = time.time()
model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)
# Save for later retrieval
model.save_pretrained(export_path)
model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
end = time.time()
# We also need to fetch and save the tokenizer
logger.info(f"Model successfully exported in {end - start:.2f} s.")
logger.info(f"Saving exported model to local storage under {export_path}.")
model.save_pretrained(export_path)
logger.info(f"Saving model tokenizer under {export_path}.")
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
tokenizer.save_pretrained(export_path)
logger.info(f"Model successfully exported in {end - start:.2f} s under {export_path}.")
11 changes: 6 additions & 5 deletions tools/auto_fill_inference_cache.py
@@ -13,18 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script to cache models for inference."""
import argparse
import json
import logging
import os
import re
import subprocess
import argparse
import tempfile
import time

import requests
from huggingface_hub import login

from optimum.neuron import version as optimum_neuron_version
import re
import requests
from requests.exceptions import HTTPError


# Example usage:
# huggingface-cli login --token hf_xxx # access to cache repo
