Skip to content

Commit

Permalink
Apply new way of resetting the loss
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelbenayoun committed Jul 5, 2024
1 parent 038dce8 commit 4d45c5c
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions optimum/neuron/trainers.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def _reduce_loss(self, tr_loss: torch.Tensor) -> torch.Tensor:
else:
dp_size = xm.xrt_world_size()

# tr_loss = tr_loss - self._prev_tr_loss
tr_loss = tr_loss - self._prev_tr_loss
tr_loss_div = tr_loss / dp_size

if self.args.mp_plugin.should_parallelize:
Expand Down Expand Up @@ -892,8 +892,12 @@ def _inner_training_loop(

# tr_loss is a tensor to avoid synchronization of TPUs through .item()
tr_loss = torch.tensor(0.0).to(args.device)
# `_prev_tr_loss` is used to keep track of the previously saved loss. This way we do not create multiple graphs when resetting it.

# `_prev_tr_loss` is used to keep track of the previously saved loss.
# By doing that, we do not have to do `tr_loss.zero_()` when logging the loss.
# This way we do not create multiple graphs depending on whether we are logging or not.
self._prev_tr_loss = torch.tensor(0.0).to(args.device)

# _total_loss_scalar is updated every time .item() has to be called on tr_loss and stores the sum of all losses
self._total_loss_scalar = 0.0
self._globalstep_last_logged = self.state.global_step
Expand Down

0 comments on commit 4d45c5c

Please sign in to comment.