
Commit 89a45ec

Michael Wu authored and facebook-github-bot committed
Always create non-empty logits / targets for MLM (#979)
Summary:
Pull Request resolved: #979

D17241503 selects only masked tokens for the final logits / targets during MLM. This fails when there are no masked tokens (e.g. at the end of a file there can be a very short batch). In this case, select just the first token in the first batch. As a side note, the new masking strategy is about 20% faster than the old one - f138005051 vs f138005044.

Reviewed By: borguz

Differential Revision: D17370855

fbshipit-source-id: 62f0540fb94819c6a269ea067b1c0b9e08c82119
1 parent c2c31ac commit 89a45ec
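
As an illustration only (not code from this commit), here is a minimal Python sketch of the failure mode the summary describes: if no tokens were masked, selecting logits / targets at masked positions produces empty tensors, and the loss over an empty selection is not meaningful.

    import torch
    import torch.nn.functional as F

    logits = torch.randn(4, 10)                 # 4 tokens, 10-word vocabulary
    targets = torch.randint(0, 10, (4,))
    mask = torch.zeros(4, dtype=torch.bool)     # a batch where nothing got masked

    selected_logits = logits[mask]              # shape (0, 10): empty selection
    selected_targets = targets[mask]            # shape (0,)
    # Depending on the framework version this is NaN or an error; either way
    # the MLM loss is broken, which is what the guard in this commit avoids.
    print(F.cross_entropy(selected_logits, selected_targets))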

File tree

2 files changed: +10 −3 lines


pytext/metric_reporters/language_model_metric_reporter.py

Lines changed: 6 additions & 3 deletions
@@ -230,7 +230,7 @@ def report_realtime_metric(self, stage):
             print(
                 f"Tokens/s: {last_batch_tps:.0f}, "
                 f"batch ppl: {math.exp(last_batch_loss):.2f}, "
-                f"agg ppl: {math.exp(aggregate_loss / float(total_masked_tokens)):.2f}, "
+                f"agg ppl: {math.exp(self._calculate_loss(aggregate_loss, total_masked_tokens)):.2f}, "
                 f"number of batches: {self.total_batches:.0f}, "
                 f"accumulated tokens/s: {tps:.0f}",
                 flush=True,
@@ -239,7 +239,7 @@ def report_realtime_metric(self, stage):
             print(
                 f"GPU-0 tokens/s: {self.last_batch_tps:.0f}, "
                 f"batch ppl: {math.exp(self.last_batch_loss):.2f}, "
-                f"agg ppl: {math.exp(self.aggregate_loss / float(self.total_masked_tokens)):.2f}, "
+                f"agg ppl: {math.exp(self.calculate_loss()):.2f}, "
                 f"number of batches: {self.total_batches}, "
                 f"accumulated tokens/s: {self.realtime_meters['tps'].avg:.0f}",
                 flush=True,
@@ -261,7 +261,10 @@ def report_realtime_metric(self, stage):
         )

     def calculate_loss(self) -> float:
-        return self.aggregate_loss / float(self.total_masked_tokens)
+        return self._calculate_loss(self.aggregate_loss, self.total_masked_tokens)
+
+    def _calculate_loss(self, aggregate_loss, total_masked_tokens) -> float:
+        return aggregate_loss / max(1, total_masked_tokens)

     def _reset(self):
         super()._reset()
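
A hedged, standalone sketch of the guarded division introduced above; the helper name mirrors the diff, but the example values are made up.

    import math

    def _calculate_loss(aggregate_loss, total_masked_tokens):
        # Guard against division by zero when a step saw no masked tokens.
        return aggregate_loss / max(1, total_masked_tokens)

    print(math.exp(_calculate_loss(0.0, 0)))   # agg ppl of 1.00 instead of a crash
    print(math.exp(_calculate_loss(6.2, 3)))   # ~7.90, the usual aggregate perplexity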

pytext/models/masked_lm.py

Lines changed: 4 additions & 0 deletions
@@ -166,6 +166,10 @@ def _get_mask(self, tokens):
         mask = self._select_tokens_to_mask(tokens, self.mask_prob)
         pad_mask = (tokens != self.vocab.get_pad_index()).long()
         mask *= pad_mask
+        if not mask.byte().any():
+            # Keep one masked token to avoid failure in the loss calculation.
+            mask[0, 0] = 1
+
         probs = torch.rand_like(tokens, dtype=torch.float)
         rand_mask = (probs < 0.1).long() * mask
         mask_mask = (probs >= 0.2).long() * mask
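
For illustration, a toy run of the new guard when the random selection masks nothing; the tensors and the pad index of 0 are assumptions for this sketch, not PyText's actual batching.

    import torch

    tokens = torch.tensor([[5, 6]])         # a very short batch at the end of a file
    mask = torch.zeros_like(tokens)         # _select_tokens_to_mask picked nothing
    pad_mask = (tokens != 0).long()         # pad index assumed to be 0 here
    mask *= pad_mask
    if not mask.byte().any():
        # Keep one masked token so the logits / targets selection is never empty.
        mask[0, 0] = 1
    print(mask)                             # tensor([[1, 0]])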
