Unify input for TorchScript Tensorizers and Models (#1256)

hudeven · facebook-github-bot · commit 898c41c4881f · 2020-02-25T11:58:56.000-08:00
Summary: Pull Request resolved: #1256 ## Dataflow from PyText client to TorchScript model in predictor 1. Client sends optional "texts", "tokens", "languages", "dense_feat" args in the predictor request 2. Predictor passes them to forward() of the TorchScript Module (Tensorizer + Model) 3. In ScriptPyTextModule(texts, tokens, languages, dense_feat), the args are converted to ScriptBatchInput(NamedTuple) => Tensorizer.forward() => tuple of Tensors => Model.forward() ## Before: We needed a wrapper for each combination of (texts, tokens, languages, dense) ## After: one wrapper for models with dense features, another wrapper for models without dense features ## Alternative: ScriptPyTextModule(inputs: ScriptBatchInput) after NamedTuple is supported in client example => predictor => ScriptPyTextModule https://fb.workplace.com/groups/811605488888068/permalink/3266598560055403/ Reviewed By: snisarg, chenyangyu1988 Differential Revision: D19900062 fbshipit-source-id: 2f3884d8d93c5b4d67b78033e9fc92b5b7b6e2fe
diff --git a/pytext/data/bert_tensorizer.py b/pytext/data/bert_tensorizer.py
@@ -11,7 +11,7 @@
 from pytext.data.tokenizers import Tokenizer, WordPieceTokenizer
 from pytext.data.utils import BOS, EOS, MASK, PAD, UNK, SpecialToken, Vocabulary
 from pytext.torchscript.tensorizer.tensorizer import VocabLookup
-from pytext.torchscript.utils import pad_2d, pad_2d_mask
+from pytext.torchscript.utils import ScriptBatchInput, pad_2d, pad_2d_mask
 from pytext.torchscript.vocab import ScriptVocabulary
 from pytext.utils.file_io import PathManager
 from pytext.utils.lazy import lazy_property
@@ -207,9 +207,7 @@ def tokenize(
         return per_sentence_tokens
 
     def forward(
-        self,
-        texts: Optional[List[List[str]]] = None,
-        pre_tokenized: Optional[List[List[List[str]]]] = None,
+        self, inputs: ScriptBatchInput
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Wire up tokenize(), numberize() and tensorize() functions for data
@@ -223,10 +221,10 @@ def forward(
         seq_lens_1d: List[int] = []
         positions_2d: List[List[int]] = []
 
-        for idx in range(self.batch_size(texts, pre_tokenized)):
+        for idx in range(self.batch_size(inputs)):
             tokens: List[List[Tuple[str, int, int]]] = self.tokenize(
-                self.get_texts_by_index(texts, idx),
-                self.get_tokens_by_index(pre_tokenized, idx),
+                self.get_texts_by_index(inputs.texts, idx),
+                self.get_tokens_by_index(inputs.tokens, idx),
             )
 
             numberized: Tuple[List[int], List[int], int, List[int]] = self.numberize(
diff --git a/pytext/data/tensorizers.py b/pytext/data/tensorizers.py
@@ -21,6 +21,7 @@
 from pytext.data.sources.data_source import Gazetteer
 from pytext.data.tokenizers import Token, Tokenizer
 from pytext.torchscript.tensorizer import VectorNormalizer
+from pytext.torchscript.utils import ScriptBatchInput
 from pytext.utils import cuda, precision
 from pytext.utils.data import Slot
 from pytext.utils.file_io import PathManager
@@ -116,19 +117,19 @@ def __init__(self):
     def set_device(self, device: str):
         self.device = device
 
-    def batch_size(
-        self, texts: Optional[List[List[str]]], tokens: Optional[List[List[List[str]]]]
-    ) -> int:
+    def batch_size(self, inputs: ScriptBatchInput) -> int:
+        texts: Optional[List[List[str]]] = inputs.texts
+        tokens: Optional[List[List[List[str]]]] = inputs.tokens
         if texts is not None:
             return len(texts)
         elif tokens is not None:
             return len(tokens)
         else:
             raise RuntimeError("Empty input for both texts and tokens.")
 
-    def row_size(
-        self, texts: Optional[List[List[str]]], tokens: Optional[List[List[List[str]]]]
-    ) -> int:
+    def row_size(self, inputs: ScriptBatchInput) -> int:
+        texts: Optional[List[List[str]]] = inputs.texts
+        tokens: Optional[List[List[List[str]]]] = inputs.tokens
         if texts is not None:
             return len(texts[0])
         elif tokens is not None:
@@ -139,14 +140,14 @@ def row_size(
     def get_texts_by_index(
         self, texts: Optional[List[List[str]]], index: int
     ) -> Optional[List[str]]:
-        if texts is None:
+        if texts is None or len(texts) == 0:
             return None
         return texts[index]
 
     def get_tokens_by_index(
         self, tokens: Optional[List[List[List[str]]]], index: int
     ) -> Optional[List[List[str]]]:
-        if tokens is None:
+        if tokens is None or len(tokens) == 0:
             return None
         return tokens[index]
 
diff --git a/pytext/data/xlm_tensorizer.py b/pytext/data/xlm_tensorizer.py
@@ -14,6 +14,7 @@
 from pytext.data.tokenizers import Tokenizer
 from pytext.data.utils import EOS, MASK, PAD, UNK, Vocabulary
 from pytext.data.xlm_constants import LANG2ID_15
+from pytext.torchscript.utils import ScriptBatchInput
 from pytext.torchscript.vocab import ScriptVocabulary
 from pytext.utils.file_io import PathManager
 from pytext.utils.lazy import lazy_property
@@ -71,17 +72,15 @@ def numberize(
         return tokens, segment_labels, seq_len, positions
 
     def forward(
-        self,
-        texts: Optional[List[List[str]]] = None,
-        pre_tokenized: Optional[List[List[List[str]]]] = None,
-        languages: Optional[List[List[str]]] = None,
+        self, inputs: ScriptBatchInput
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Wire up tokenize(), numberize() and tensorize() functions for data
         processing.
         """
-        batch_size: int = self.batch_size(texts, pre_tokenized)
-        row_size: int = self.row_size(texts, pre_tokenized)
+        batch_size: int = self.batch_size(inputs)
+        row_size: int = self.row_size(inputs)
+        languages: Optional[List[List[str]]] = inputs.languages
         if languages is None:
             languages = [[self.default_language] * row_size] * batch_size
 
@@ -90,10 +89,10 @@ def forward(
         seq_lens_1d: List[int] = []
         positions_2d: List[List[int]] = []
 
-        for idx in range(self.batch_size(texts, pre_tokenized)):
+        for idx in range(self.batch_size(inputs)):
             tokens: List[List[Tuple[str, int, int]]] = self.tokenize(
-                self.get_texts_by_index(texts, idx),
-                self.get_tokens_by_index(pre_tokenized, idx),
+                self.get_texts_by_index(inputs.texts, idx),
+                self.get_tokens_by_index(inputs.tokens, idx),
             )
             language_ids: List[int] = [
                 self.language_vocab.idx.get(
diff --git a/pytext/models/doc_model.py b/pytext/models/doc_model.py
@@ -119,7 +119,15 @@ def __init__(self):
                 self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
 
             @jit.script_method
-            def forward(self, tokens: List[List[str]]):
+            def forward(
+                self,
+                texts: Optional[List[str]] = None,
+                tokens: Optional[List[List[str]]] = None,
+                languages: Optional[List[str]] = None,
+            ):
+                if tokens is None:
+                    raise RuntimeError("tokens is required")
+
                 seq_lens = make_sequence_lengths(tokens)
                 word_ids = self.vocab.lookup_indices_2d(tokens)
                 word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
@@ -136,11 +144,23 @@ def __init__(self):
                 self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
 
             @jit.script_method
-            def forward(self, tokens: List[List[str]], dense_feat: List[List[float]]):
+            def forward(
+                self,
+                texts: Optional[List[str]] = None,
+                tokens: Optional[List[List[str]]] = None,
+                languages: Optional[List[str]] = None,
+                dense_feat: Optional[List[List[float]]] = None,
+            ):
+                if tokens is None:
+                    raise RuntimeError("tokens is required")
+
                 seq_lens = make_sequence_lengths(tokens)
                 word_ids = self.vocab.lookup_indices_2d(tokens)
                 word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
-                dense_feat = self.normalizer.normalize(dense_feat)
+                if dense_feat is not None:
+                    dense_feat = self.normalizer.normalize(dense_feat)
+                else:
+                    raise RuntimeError("dense is required")
                 logits = self.model(
                     torch.tensor(word_ids),
                     torch.tensor(seq_lens),
diff --git a/pytext/models/roberta.py b/pytext/models/roberta.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 
-from typing import Dict, List, Tuple
+from typing import Dict, Tuple
 
 import torch
 from pytext.common.constants import Stage
@@ -25,7 +25,7 @@
 from pytext.models.representations.transformer_sentence_encoder_base import (
     TransformerSentenceEncoderBase,
 )
-from pytext.torchscript.module import get_script_module_cls
+from pytext.torchscript.module import ScriptPyTextModule
 from pytext.utils.file_io import PathManager
 from pytext.utils.usage import log_class_usage
 from torch.serialization import default_restore_location
@@ -152,11 +152,7 @@ def torchscriptify(self, tensorizers, traced_model):
         values according to the output layer (eg. as a dict mapping class name to score)
         """
         script_tensorizer = tensorizers["tokens"].torchscriptify()
-        script_module_cls = get_script_module_cls(
-            script_tensorizer.tokenizer.input_type()
-        )
-
-        return script_module_cls(
+        return ScriptPyTextModule(
             model=traced_model,
             output_layer=self.output_layer.torchscript_predictions(),
             tensorizer=script_tensorizer,
diff --git a/pytext/torchscript/module.py b/pytext/torchscript/module.py
@@ -1,20 +1,10 @@
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
-
 from typing import List, Optional
 
 import torch
 from pytext.torchscript.tensorizer.tensorizer import ScriptTensorizer
-from pytext.torchscript.utils import ScriptInputType, squeeze_1d, squeeze_2d
-
-
-def get_script_module_cls(input_type: ScriptInputType) -> torch.jit.ScriptModule:
-    if input_type.is_text():
-        return ScriptTextModule
-    elif input_type.is_token():
-        return ScriptTokenModule
-    else:
-        raise RuntimeError("Only support text or token input type...")
+from pytext.torchscript.utils import ScriptBatchInput, squeeze_1d, squeeze_2d
 
 
 class ScriptModule(torch.jit.ScriptModule):
@@ -23,7 +13,7 @@ def set_device(self, device: str):
         self.tensorizer.set_device(device)
 
 
-class ScriptTextModule(ScriptModule):
+class ScriptPyTextModule(ScriptModule):
     def __init__(
         self,
         model: torch.jit.ScriptModule,
@@ -36,73 +26,36 @@ def __init__(
         self.tensorizer = tensorizer
 
     @torch.jit.script_method
-    def forward(self, texts: List[str]):
-        input_tensors = self.tensorizer(texts=squeeze_1d(texts))
-        logits = self.model(input_tensors)
-        return self.output_layer(logits)
-
-
-class ScriptTokenModule(ScriptModule):
-    def __init__(
-        self,
-        model: torch.jit.ScriptModule,
-        output_layer: torch.jit.ScriptModule,
-        tensorizer: ScriptTensorizer,
-    ):
-        super().__init__()
-        self.model = model
-        self.output_layer = output_layer
-        self.tensorizer = tensorizer
-
-    @torch.jit.script_method
-    def forward(self, tokens: List[List[str]]):
-        input_tensors = self.tensorizer(pre_tokenized=squeeze_2d(tokens))
-        logits = self.model(input_tensors)
-        return self.output_layer(logits)
-
-
-class ScriptTokenLanguageModule(ScriptModule):
-    def __init__(
+    def forward(
         self,
-        model: torch.jit.ScriptModule,
-        output_layer: torch.jit.ScriptModule,
-        tensorizer: ScriptTensorizer,
+        texts: Optional[List[str]] = None,
+        tokens: Optional[List[List[str]]] = None,
+        languages: Optional[List[str]] = None,
     ):
-        super().__init__()
-        self.model = model
-        self.output_layer = output_layer
-        self.tensorizer = tensorizer
-
-    @torch.jit.script_method
-    def forward(self, tokens: List[List[str]], languages: Optional[List[str]] = None):
-        input_tensors = self.tensorizer(
-            pre_tokenized=squeeze_2d(tokens), languages=squeeze_1d(languages)
+        inputs: ScriptBatchInput = ScriptBatchInput(
+            texts=squeeze_1d(texts),
+            tokens=squeeze_2d(tokens),
+            languages=squeeze_1d(languages),
         )
+        input_tensors = self.tensorizer(inputs)
         logits = self.model(input_tensors)
         return self.output_layer(logits)
 
 
-class ScriptTokenLanguageModuleWithDenseFeature(ScriptModule):
-    def __init__(
-        self,
-        model: torch.jit.ScriptModule,
-        output_layer: torch.jit.ScriptModule,
-        tensorizer: ScriptTensorizer,
-    ):
-        super().__init__()
-        self.model = model
-        self.output_layer = output_layer
-        self.tensorizer = tensorizer
-
+class ScriptPyTextModuleWithDense(ScriptPyTextModule):
     @torch.jit.script_method
     def forward(
         self,
-        tokens: List[List[str]],
         dense_feat: List[List[float]],
+        texts: Optional[List[str]] = None,
+        tokens: Optional[List[List[str]]] = None,
         languages: Optional[List[str]] = None,
     ):
-        input_tensors = self.tensorizer(
-            pre_tokenized=squeeze_2d(tokens), languages=squeeze_1d(languages)
+        inputs: ScriptBatchInput = ScriptBatchInput(
+            texts=squeeze_1d(texts),
+            tokens=squeeze_2d(tokens),
+            languages=squeeze_1d(languages),
         )
+        input_tensors = self.tensorizer(inputs)
         logits = self.model(input_tensors, torch.tensor(dense_feat).float())
         return self.output_layer(logits)
diff --git a/pytext/torchscript/tests/test_tensorizer.py b/pytext/torchscript/tests/test_tensorizer.py
@@ -13,7 +13,7 @@
 )
 from pytext.torchscript.tensorizer.tensorizer import VocabLookup
 from pytext.torchscript.tokenizer import ScriptDoNothingTokenizer
-from pytext.torchscript.tokenizer.tokenizer import ScriptTextTokenizerBase
+from pytext.torchscript.tokenizer.tokenizer import ScriptTokenizerBase
 from pytext.torchscript.utils import squeeze_1d, squeeze_2d
 from pytext.torchscript.vocab import ScriptVocabulary
 
@@ -26,7 +26,7 @@ def _mock_vocab(self):
         )
 
     def _mock_tokenizer(self):
-        class MockTokenizer(ScriptTextTokenizerBase):
+        class MockTokenizer(ScriptTokenizerBase):
             def __init__(self, tokens: List[Tuple[str, int, int]]):
                 super().__init__()
                 self.tokens = torch.jit.Attribute(tokens, List[Tuple[str, int, int]])
diff --git a/pytext/torchscript/tokenizer/__init__.py b/pytext/torchscript/tokenizer/__init__.py
@@ -2,18 +2,12 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 
 from .bpe import ScriptBPE
-from .tokenizer import (
-    ScriptBPETokenizer,
-    ScriptDoNothingTokenizer,
-    ScriptTextTokenizerBase,
-    ScriptTokenTokenizerBase,
-)
+from .tokenizer import ScriptBPETokenizer, ScriptDoNothingTokenizer, ScriptTokenizerBase
 
 
 __all__ = [
     "ScriptBPE",
     "ScriptBPETokenizer",
     "ScriptDoNothingTokenizer",
-    "ScriptTextTokenizerBase",
-    "ScriptTokenTokenizerBase",
+    "ScriptTokenizerBase",
 ]
diff --git a/pytext/torchscript/tokenizer/tokenizer.py b/pytext/torchscript/tokenizer/tokenizer.py
diff --git a/pytext/torchscript/utils.py b/pytext/torchscript/utils.py