[Bug] Fix crash when logging optimizer state to tb (#417)
billishyahao authored Aug 27, 2024
1 parent 543543a commit 1280f59
Showing 1 changed file with 6 additions and 0 deletions.
megatron/training.py (6 additions, 0 deletions)
@@ -1032,6 +1032,12 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         if args.log_optimizer_states_to_tensorboard and optimizer is not None:
             opt_stats = [0.0] * 8
             opt_stats_2 = [0.0] * 4
+
+            #TODO(billishyahao): Remove me after bf16_optimizer promotes its state.
+            if not hasattr(optimizer, "state"):
+                assert hasattr(optimizer, "optimizer"), f"Optimizer must have optimizer property."
+                optimizer.state = optimizer.optimizer.state
+
             for _, group in enumerate(optimizer.param_groups):
                 for _, param in enumerate(group['params']):
                     opt_stats[0] += (torch.norm(optimizer.state[param]['exp_avg_sq']).item())**2
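
For context (an explanatory sketch, not part of the commit): the crash occurs because some wrapper optimizers, such as a bf16 optimizer that delegates to an inner torch optimizer, expose the inner instance as .optimizer but do not expose a .state attribute of their own, so optimizer.state[param] raises AttributeError when optimizer-state logging is enabled. The sketch below reproduces that failure mode and applies the same guard; the wrapper class Bf16Wrapper and the helper get_optimizer_state are hypothetical names, not code from the repository.

    import torch

    class Bf16Wrapper:
        # Hypothetical stand-in for a bf16-style optimizer wrapper: it exposes
        # the inner optimizer as .optimizer but does not promote a .state
        # attribute of its own.
        def __init__(self, inner):
            self.optimizer = inner
            self.param_groups = inner.param_groups

    def get_optimizer_state(optimizer):
        # Same guard as the patch: if the wrapper has no state dict of its own,
        # borrow the inner optimizer's state so logging code can read it.
        if not hasattr(optimizer, "state"):
            assert hasattr(optimizer, "optimizer"), "Optimizer must have optimizer property."
            optimizer.state = optimizer.optimizer.state
        return optimizer.state

    param = torch.nn.Parameter(torch.randn(4))
    inner = torch.optim.Adam([param], lr=1e-3)
    param.grad = torch.randn(4)
    inner.step()  # populates exp_avg / exp_avg_sq for param

    wrapped = Bf16Wrapper(inner)
    state = get_optimizer_state(wrapped)  # without the guard, wrapped.state raises AttributeError
    print(torch.norm(state[param]["exp_avg_sq"]).item())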
