
Commit 3c4a8e4

Nithin-Holla authored and facebook-github-bot committed
Enabling word-level timestamps for Wav2Vec 2.0 (#3627)
Summary:

# Before submitting

- [ ] Was this discussed/approved via a Github issue? (no need for typos, doc improvements)
- [x] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/master/CONTRIBUTING.md)?
- [ ] Did you make sure to update the docs?
- [ ] Did you write any new necessary tests?

## What does this PR do?

Fixes #3371. Currently, the output of Wav2Vec 2.0 decoding does not contain word-level start/end times, which are useful for many ASR applications. Based on the discussion [here](flashlight/flashlight#618), they can be computed from the output of the Flashlight decoder. For the KenLM decoder, we first obtain the frame number corresponding to each non-blank token. Next, the timestamp of each character can be computed as `segment_start + frame_no/total_frames * segment_duration`. Finally, the start and end time of each word can be derived from the timestamps of its word-boundary characters. To enable this, the frame number of each non-blank character is now returned as part of the KenLM decoding result, similar to the `timesteps` output of the [ctcdecode](https://github.com/parlance/ctcdecode#outputs-from-the-decode-method) library.

## PR review

alexeib

Pull Request resolved: #3627

Reviewed By: michaelauli

Differential Revision: D29282488

Pulled By: alexeib

fbshipit-source-id: b5fe64bf50abd7ef8e9539f4e338937c866eb0ca
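The three steps above are mechanical to implement on top of the new output; a minimal sketch (not part of this commit), assuming fairseq's `|` word-boundary character and hypothetical `segment_start`, `segment_duration`, and `total_frames` inputs:

```python
from typing import List, Tuple


def word_times(
    chars: List[str],         # decoded characters, blanks/repeats removed
    timesteps: List[int],     # frame index per character, from get_timesteps()
    total_frames: int,        # T, the number of emission frames
    segment_start: float,     # segment offset in seconds (assumed known)
    segment_duration: float,  # segment length in seconds (assumed known)
    boundary: str = "|",      # word-boundary character in the letter dictionary
) -> List[Tuple[float, float]]:
    # Step 2: timestamp of each character.
    stamps = [segment_start + t / total_frames * segment_duration for t in timesteps]
    # Step 3: word start/end times from the word-boundary characters.
    words, start = [], None
    for ch, ts in zip(chars, stamps):
        if ch == boundary:
            if start is not None:
                words.append((start, ts))
            start = None
        elif start is None:
            start = ts
    if start is not None:  # trailing word with no closing boundary
        words.append((start, stamps[-1]))
    return words
```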
1 parent 900a607 commit 3c4a8e4

File tree

2 files changed: +44 / -0 lines

Diff for: examples/speech_recognition/new/decoders/flashlight_decoder.py

+22
```diff
@@ -118,6 +118,27 @@ def __init__(self, cfg: FlashlightDecoderConfig, tgt_dict: Dictionary) -> None:
             self.decoder_opts, self.lm, self.silence, self.blank, []
         )
 
+    def get_timesteps(self, token_idxs: List[int]) -> List[int]:
+        """Returns frame numbers corresponding to every non-blank token.
+
+        Parameters
+        ----------
+        token_idxs : List[int]
+            IDs of decoded tokens.
+
+        Returns
+        -------
+        List[int]
+            Frame numbers corresponding to every non-blank token.
+        """
+        timesteps = []
+        for i, token_idx in enumerate(token_idxs):
+            if token_idx == self.blank:
+                continue
+            if i == 0 or token_idx != token_idxs[i-1]:
+                timesteps.append(i)
+        return timesteps
+
     def decode(
         self,
         emissions: torch.FloatTensor,
@@ -134,6 +155,7 @@ def decode(
                     {
                         "tokens": self.get_tokens(result.tokens),
                         "score": result.score,
+                        "timesteps": self.get_timesteps(result.tokens),
                         "words": [
                             self.word_dict.get_entry(x) for x in result.words if x >= 0
                         ],
```
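The collapse rule mirrors CTC decoding: blank frames are dropped, and within a run of repeated tokens only the first frame is kept. A hypothetical call (token IDs and blank index invented for illustration):

```python
# Assume self.blank == 0 for this decoder instance.
token_idxs = [0, 5, 5, 0, 7, 7, 3]
# frame:      0  1  2  3  4  5  6
decoder.get_timesteps(token_idxs)  # -> [1, 4, 6]
# Frames 2 and 5 repeat the previous token and are collapsed into the
# first frame of their runs; the blanks at frames 0 and 3 are skipped.
```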

Diff for: examples/speech_recognition/w2l_decoder.py

+22
```diff
@@ -12,6 +12,7 @@
 import gc
 import itertools as it
 import os.path as osp
+from typing import List
 import warnings
 from collections import deque, namedtuple
 
@@ -194,6 +195,26 @@ def __init__(self, args, tgt_dict):
             self.decoder_opts, self.lm, self.silence, self.blank, []
         )
 
+    def get_timesteps(self, token_idxs: List[int]) -> List[int]:
+        """Returns frame numbers corresponding to every non-blank token.
+
+        Parameters
+        ----------
+        token_idxs : List[int]
+            IDs of decoded tokens.
+
+        Returns
+        -------
+        List[int]
+            Frame numbers corresponding to every non-blank token.
+        """
+        timesteps = []
+        for i, token_idx in enumerate(token_idxs):
+            if token_idx == self.blank:
+                continue
+            if i == 0 or token_idx != token_idxs[i-1]:
+                timesteps.append(i)
+        return timesteps
 
     def decode(self, emissions):
         B, T, N = emissions.size()
@@ -208,6 +229,7 @@ def decode(self, emissions):
                     {
                         "tokens": self.get_tokens(result.tokens),
                         "score": result.score,
+                        "timesteps": self.get_timesteps(result.tokens),
                         "words": [
                             self.word_dict.get_entry(x) for x in result.words if x >= 0
                         ],
```
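Since `get_tokens` applies the same blank-and-repeat collapsing, the `tokens` and `timesteps` lists in each hypothesis line up one-to-one. A usage sketch (decoder construction and `emissions` elided; they follow the existing w2l interface):

```python
# hypos[b] is the n-best list for batch element b, best hypothesis first.
hypos = decoder.decode(emissions)
best = hypos[0][0]
# The i-th entry of "timesteps" is the emission-frame index at which the
# i-th decoded token first appears.
for token, frame in zip(best["tokens"], best["timesteps"]):
    print(int(token), frame)
```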
