Commit

update ckpt conversion flow to use the new sharded ckpt path structure (
5cp authored Nov 21, 2023
1 parent 1a1d801 commit 99f080f
4 changes: 1 addition & 3 deletions optimum/neuron/distributed/checkpointing.py
@@ -38,13 +38,11 @@ def consolidate_tensor_parallel_checkpoints(checkpoint_dir: Union[str, Path]) ->

     state_dicts = []

-    for sharded_checkpoint in checkpoint_dir.glob("tp_rank_*"):
+    for sharded_checkpoint in sorted(checkpoint_dir.glob("tp_rank_*/checkpoint.pt")):
         if not sharded_checkpoint.is_file():
             continue
         state_dicts.append(torch.load(sharded_checkpoint))

-    state_dicts = sorted(state_dicts, key=lambda d: d["tp_rank"])
-
     parameter_names = state_dicts[0]["model"].keys()
     sharded_metadatas = {
         name: [
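The change relies on `sorted()` over the new `tp_rank_*/checkpoint.pt` layout to visit shards in tensor-parallel rank order, which makes the old post-hoc sort on each state dict's `"tp_rank"` key unnecessary. A minimal stdlib-only sketch of why this holds (stub files stand in for real checkpoints, and the zero-padded directory names are an assumption about the layout; actual shard contents would be loaded with `torch.load` as in the diff):

```python
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    checkpoint_dir = Path(tmp)

    # Simulate the new sharded layout: one tp_rank_XX/checkpoint.pt per rank,
    # created deliberately out of order.
    for rank in (2, 0, 1):
        shard_dir = checkpoint_dir / f"tp_rank_{rank:02d}"
        shard_dir.mkdir()
        (shard_dir / "checkpoint.pt").write_bytes(b"stub")

    # sorted() orders the paths lexicographically; with zero-padded rank
    # directory names this matches numeric tp-rank order, so no explicit
    # sort by d["tp_rank"] is needed afterwards.
    shards = sorted(checkpoint_dir.glob("tp_rank_*/checkpoint.pt"))
    print([p.parent.name for p in shards])
    # ['tp_rank_00', 'tp_rank_01', 'tp_rank_02']
```

Note that lexicographic order only coincides with numeric order while the rank component stays zero-padded to a fixed width; an unpadded `tp_rank_10` would sort before `tp_rank_2`.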
