Migrate pytext/utils/torch.py logic into pytext/torchscript/ for long term maintainability (#1082)

chenyangyu1988 · facebook-github-bot · commit dbad1917734d · 2019-10-29T13:36:41.000-07:00
Summary: Pull Request resolved: #1082. Migrate pytext/utils/torch.py logic into pytext/torchscript/ for long-term maintainability. Differential Revision: D18207798. fbshipit-source-id: c5680edf99d20b4e46fa887d4865dce138da885a
diff --git a/pytext/torchscript/tensorizer/__init__.py b/pytext/torchscript/tensorizer/__init__.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 
-from pytext.torchscript.tensorizer.bert import ScriptBERTTensorizer
-from pytext.torchscript.tensorizer.roberta import ScriptRoBERTaTensorizer
+from .bert import ScriptBERTTensorizer
+from .normalizer import VectorNormalizer
+from .roberta import ScriptRoBERTaTensorizer
 
 
-__all__ = ["ScriptBERTTensorizer", "ScriptRoBERTaTensorizer"]
+__all__ = ["ScriptBERTTensorizer", "ScriptRoBERTaTensorizer", "VectorNormalizer"]
diff --git a/pytext/torchscript/tensorizer/bert.py b/pytext/torchscript/tensorizer/bert.py
@@ -4,7 +4,8 @@
 from typing import List, Optional, Tuple
 
 import torch
-from pytext.utils.torch import Vocabulary as ScriptVocabulary, pad_2d_mask
+from pytext.torchscript.utils import pad_2d_mask
+from pytext.torchscript.vocab import ScriptVocabulary
 
 from .tensorizer import ScriptTensorizer, VocabLookup
 
diff --git a/pytext/torchscript/tensorizer/normalizer.py b/pytext/torchscript/tensorizer/normalizer.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import List
+
+import torch
+
+
+class VectorNormalizer(torch.nn.Module):
+    """Normalize dense feature vectors in place via (x - mean) / stddev.
+
+    This is a torch.nn.Module so that normalize() can be called at training
+    time in the tensorizer as well as at inference time from a TorchScript
+    forward function. To use it in a tensorizer, call update_meta_data once
+    per row in your initialize function, then call calculate_feature_stats
+    after the last row. See usage in FloatListTensorizer for an example.
+
+    Setting do_normalization=False makes normalize an identity function and
+    turns update_meta_data / calculate_feature_stats into no-ops.
+    """
+
+    def __init__(self, dim: int, do_normalization: bool = True):
+        super().__init__()
+        self.num_rows = 0
+        self.feature_sums = [0] * dim
+        self.feature_squared_sums = [0] * dim
+        self.do_normalization = do_normalization
+        self.feature_avgs = [0.0] * dim
+        self.feature_stddevs = [1.0] * dim
+
+    def __getstate__(self):
+        # Only these six statistics fields are serialized.
+        return {
+            "num_rows": self.num_rows,
+            "feature_sums": self.feature_sums,
+            "feature_squared_sums": self.feature_squared_sums,
+            "do_normalization": self.do_normalization,
+            "feature_avgs": self.feature_avgs,
+            "feature_stddevs": self.feature_stddevs,
+        }
+
+    def __setstate__(self, state):
+        self.num_rows = state["num_rows"]
+        self.feature_sums = state["feature_sums"]
+        self.feature_squared_sums = state["feature_squared_sums"]
+        self.do_normalization = state["do_normalization"]
+        self.feature_avgs = state["feature_avgs"]
+        self.feature_stddevs = state["feature_stddevs"]
+
+    # TODO: this is only to satisfy the TorchScript compiler.
+    # Can remove when D17551196 lands
+    def forward(self):
+        pass
+
+    def update_meta_data(self, vec):
+        """Accumulate one row's per-feature sum and sum of squares."""
+        if self.do_normalization:
+            self.num_rows += 1
+            for i, value in enumerate(vec):
+                self.feature_sums[i] += value
+                self.feature_squared_sums[i] += value ** 2
+
+    def calculate_feature_stats(self):
+        """Derive per-feature mean and population stddev from the sums."""
+        # With no rows seen, keep the defaults instead of dividing by zero.
+        if not self.do_normalization or self.num_rows == 0:
+            return
+        self.feature_avgs = [x / self.num_rows for x in self.feature_sums]
+        # Rounding can push E[x^2] - E[x]^2 below 0; clamp to avoid complex roots.
+        self.feature_stddevs = [
+            max(sq_sum / self.num_rows - avg ** 2, 0.0) ** 0.5
+            for sq_sum, avg in zip(self.feature_squared_sums, self.feature_avgs)
+        ]
+
+    def normalize(self, vec: List[List[float]]):
+        """Normalize every row of vec in place and return vec."""
+        if self.do_normalization:
+            for i in range(len(vec)):
+                for j in range(len(vec[i])):
+                    vec[i][j] -= self.feature_avgs[j]
+                    stddev = self.feature_stddevs[j]
+                    vec[i][j] /= stddev if stddev != 0 else 1.0
+        return vec
diff --git a/pytext/torchscript/tensorizer/tensorizer.py b/pytext/torchscript/tensorizer/tensorizer.py
@@ -4,7 +4,7 @@
 from typing import List, Optional, Tuple
 
 import torch
-from pytext.utils.torch import Vocabulary as ScriptVocabulary
+from pytext.torchscript.vocab import ScriptVocabulary
 
 
 class ScriptTensorizer(torch.jit.ScriptModule):
@@ -29,6 +29,10 @@ def tensorize(self, rows):
 
 
 class VocabLookup(torch.jit.ScriptModule):
+    """
+    TorchScript implementation of lookup_tokens() in pytext/data/tensorizers.py
+    """
+
     def __init__(self, vocab: ScriptVocabulary):
         super().__init__()
         self.vocab = vocab
diff --git a/pytext/torchscript/tests/test_tensorizer.py b/pytext/torchscript/tests/test_tensorizer.py
@@ -8,7 +8,7 @@
 import torch
 from pytext.torchscript.tensorizer import ScriptBERTTensorizer, ScriptRoBERTaTensorizer
 from pytext.torchscript.tensorizer.tensorizer import VocabLookup
-from pytext.utils.torch import Vocabulary as ScriptVocabulary
+from pytext.torchscript.vocab import ScriptVocabulary
 
 
 class TensorizerTest(unittest.TestCase):
diff --git a/pytext/torchscript/tests/test_tokenizer.py b/pytext/torchscript/tests/test_tokenizer.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import io
+import pickle
+import unittest
+
+import torch
+from pytext.torchscript.tokenizer import ScriptBPE
+from pytext.torchscript.utils import make_byte_inputs, utf8_chars
+
+
+BPE_VOCAB_FILE = io.StringIO(  # in-memory vocab fixture read by the BPE tests
+    """
+hello_EOW 20
+world_EOW 18
+th  17
+is_EOW 16
+bpe_EOW 15
+! 14
+h 13
+t 6
+s_EOW 2
+i -1
+今_EOW -2
+"""
+)
+
+
+class BPETest(unittest.TestCase):
+    """Exercise ScriptBPE tokenization plus the utf8_chars and
+    make_byte_inputs helpers from pytext.torchscript.utils.
+    """
+
+    def test_utf8_chars(self):
+        """utf8_chars must split text into the same characters as list()."""
+        for sample in ("hello", "💩", "¯\\_(ツ)_/¯", "今日"):
+            self.assertEqual(list(sample), utf8_chars(sample))
+
+    def test_simple_bpe(self):
+        """Tokenize a word list with a BPE loaded from BPE_VOCAB_FILE."""
+        BPE_VOCAB_FILE.seek(0)  # rewind the module-level StringIO
+        tokenizer = ScriptBPE.from_vocab_file(BPE_VOCAB_FILE)
+        pieces = tokenizer.tokenize(["hello", "world", "this", "is", "bpe", "今日"])
+        self.assertEqual(
+            ["hello_EOW", "world_EOW", "th", "is_EOW", "is_EOW", "bpe_EOW", "今_EOW"],
+            pieces,
+        )
+
+    def test_pickle_bpe(self):
+        """A BPE restored from pickle must produce the same tokens."""
+        BPE_VOCAB_FILE.seek(0)  # rewind the module-level StringIO
+        source_bpe = ScriptBPE.from_vocab_file(BPE_VOCAB_FILE)
+        restored = pickle.loads(pickle.dumps(source_bpe))
+        pieces = restored.tokenize(["hello", "world", "this", "is", "bpe", "今日"])
+        self.assertEqual(
+            ["hello_EOW", "world_EOW", "th", "is_EOW", "is_EOW", "bpe_EOW", "今_EOW"],
+            pieces,
+        )
+
+    def test_make_bytes_input(self):
+        """Compare make_byte_inputs output with hand-built padded byte lists."""
+        s1 = "I want some coffee today"
+        s2 = "Turn it up"
+        max_char_length = 5
+        byte_tensor, seq_lens = make_byte_inputs(
+            [s1.split(), s2.split()], max_char_length
+        )
+
+        def to_bytes(word, pad_to):
+            # Encoded byte values of word, right-padded with zeros.
+            return list(word.encode()) + [0] * (pad_to - len(word))
+
+        # "coffee" is expected cut to 5 characters, and the 3-word sentence is
+        # expected filled up to 5 words with all-zero entries.
+        first_row = ("I", "want", "some", "coffe", "today")
+        second_row = ("Turn", "it", "up", "", "")
+        expected_bytes = [
+            [to_bytes(word, 5) for word in first_row],
+            [to_bytes(word, 5) for word in second_row],
+        ]
+        expected_seq_lens = [5, 3]
+
+        self.assertIsInstance(byte_tensor, torch.LongTensor)
+        self.assertIsInstance(seq_lens, torch.LongTensor)
+        self.assertEqual(byte_tensor.tolist(), expected_bytes)
+        self.assertEqual(seq_lens.tolist(), expected_seq_lens)
diff --git a/pytext/torchscript/tests/test_vocab.py b/pytext/torchscript/tests/test_vocab.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import unittest
+
+import torch
+from pytext.torchscript.vocab import ScriptVocabulary
+from torch import jit
+
+
+class VocabTest(unittest.TestCase):
+    """Tests for ScriptVocabulary lookups, directly and inside ScriptModules."""
+
+    def setUp(self):
+        # "UNK" is at index 0; the lookup tests expect unknown words to map to 0.
+        self.vocab = ScriptVocabulary(["UNK", "a", "b", "c", "d"])
+
+    def test_vocab_lookup(self):
+        # There are bugs with just making this a script, so the lookup is wrapped
+        # in a ScriptModule; eventually these can be simpler.
+        class LookupWord(jit.ScriptModule):
+            def __init__(self, vocab):
+                super().__init__()
+                self.vocab = vocab
+
+            @jit.script_method
+            def forward(self, word: str):
+                return self.vocab.idx[word]
+
+        word_to_index = LookupWord(self.vocab)
+        for expected_index, word in ((1, "a"), (3, "c")):
+            self.assertEqual(expected_index, word_to_index(word))
+        with self.assertRaises(Exception):
+            word_to_index("notaword")
+
+    def test_vocab_idx_lookup(self):
+        # There are bugs with just making this a script, so the lookup is wrapped
+        # in a ScriptModule; eventually these can be simpler.
+        class LookupIndex(jit.ScriptModule):
+            def __init__(self, vocab):
+                super().__init__()
+                self.vocab = vocab
+
+            @jit.script_method
+            def forward(self, i: int):
+                return self.vocab.vocab[i]
+
+        index_to_word = LookupIndex(self.vocab)
+        for expected_word, index in (("UNK", 0), ("b", 2)):
+            self.assertEqual(expected_word, index_to_word(index))
+        with self.assertRaises(Exception):
+            index_to_word(20)
+
+    def test_lookup_1d(self):
+        """Unknown words map to index 0; an empty list stays empty."""
+        found = self.vocab.lookup_indices_1d(["a", "e", "c", "d"])
+        self.assertEqual([1, 0, 3, 4], found)
+        self.assertEqual([], self.vocab.lookup_indices_1d([]))
+
+    def test_lookup_2d(self):
+        """Each inner list is looked up on its own; empty lists stay empty."""
+        found = self.vocab.lookup_indices_2d([["a", "e", "c", "d"], [], ["b"]])
+        self.assertEqual([[1, 0, 3, 4], [], [2]], found)
+        self.assertEqual([], self.vocab.lookup_indices_2d([]))
+
+    def test_custom_unk(self):
+        # "UNK" sits at index 1 here, so the unknown word "e" must map to 1.
+        vocab = ScriptVocabulary(["a", "UNK", "b", "c", "d"], unk_idx=1)
+        self.assertEqual([0, 1, 3, 4], vocab.lookup_indices_1d(["a", "e", "c", "d"]))
+
+    def test_lookup_words_1d_cycle_heuristic(self):
+        """Indices 1, 0, 0 with fill-ins "y", "z" are expected to give a, y, z."""
+        words = self.vocab.lookup_words_1d_cycle_heuristic(
+            torch.tensor([1, 0, 0]), [], ["y", "z"]
+        )
+        self.assertEqual(words, ["a", "y", "z"])
diff --git a/pytext/torchscript/tokenizer/__init__.py b/pytext/torchscript/tokenizer/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from .bpe import ScriptBPE
+
+
+__all__ = ["ScriptBPE"]
diff --git a/pytext/torchscript/tokenizer/bpe.py b/pytext/torchscript/tokenizer/bpe.py
diff --git a/pytext/torchscript/utils.py b/pytext/torchscript/utils.py
diff --git a/pytext/torchscript/vocab.py b/pytext/torchscript/vocab.py