@@ -133,7 +133,7 @@ def tensorize(self, batch):
133
133
"""Tensorizer knows how to pad and tensorize a batch of it's own output."""
134
134
return batch
135
135
136
- def initialize (self ):
136
+ def initialize (self , from_scratch = True ):
137
137
"""
138
138
The initialize function is carefully designed to allow us to read through the
139
139
training dataset only once, and not store it in memory. As such, it can't itself
@@ -264,9 +264,9 @@ def _lookup_tokens(self, text=None, pre_tokenized=None):
264
264
def _reverse_lookup (self , token_ids ):
265
265
return [self .vocab [id ] for id in token_ids ]
266
266
267
- def initialize (self , vocab_builder = None ):
267
+ def initialize (self , vocab_builder = None , from_scratch = True ):
268
268
"""Build vocabulary based on training corpus."""
269
- if self .vocab :
269
+ if self .vocab and from_scratch :
270
270
if self .vocab_config .build_from_data or self .vocab_config .vocab_files :
271
271
print (
272
272
f"`{ self .text_column } ` column: vocab already provided, skipping "
@@ -279,10 +279,12 @@ def initialize(self, vocab_builder=None):
279
279
f"To create token tensorizer for '{ self .text_column } ', either "
280
280
f"`build_from_data` or `vocab_files` must be set."
281
281
)
282
-
283
- self .vocab_builder = vocab_builder or VocabBuilder ()
284
- self .vocab_builder .use_bos = self .add_bos_token
285
- self .vocab_builder .use_eos = self .add_eos_token
282
+ if not self .vocab_builder :
283
+ # Otherwise we are resuming (not initializing from scratch), so
284
+ # self.vocab_builder was already set by a previous initialization.
285
+ self .vocab_builder = vocab_builder or VocabBuilder ()
286
+ self .vocab_builder .use_bos = self .add_bos_token
287
+ self .vocab_builder .use_eos = self .add_eos_token
286
288
if not self .vocab_config .build_from_data :
287
289
self ._add_vocab_from_files ()
288
290
self .vocab = self .vocab_builder .make_vocab ()
@@ -561,11 +563,11 @@ def __init__(
561
563
def column_schema (self ):
562
564
return [(self .label_column , str )]
563
565
564
- def initialize (self ):
566
+ def initialize (self , from_scratch = True ):
565
567
"""
566
568
Look through the dataset for all labels and create a vocab map for them.
567
569
"""
568
- if self .vocab :
570
+ if self .vocab and from_scratch :
569
571
return
570
572
try :
571
573
while True :
@@ -652,11 +654,11 @@ def _get_row_value_as_str(self, row) -> str:
652
654
row_value = str (row_value .item ())
653
655
return row_value
654
656
655
- def initialize (self ):
657
+ def initialize (self , from_scratch = True ):
656
658
"""
657
659
Look through the dataset for all uids and create a vocab map for them.
658
660
"""
659
- if self .vocab :
661
+ if self .vocab and from_scratch :
660
662
return
661
663
try :
662
664
while True :
@@ -881,24 +883,27 @@ def __init__(
881
883
self .allow_unknown = allow_unknown
882
884
self .tokenizer = tokenizer or Tokenizer ()
883
885
self .pad_idx = Padding .DEFAULT_LABEL_PAD_IDX
886
+ self .vocab_builder = VocabBuilder ()
887
+ self .vocab_builder .add (NO_LABEL )
888
+ self .vocab_builder .use_pad = False
889
+ self .vocab_builder .use_unk = self .allow_unknown
890
+ self .vocab = None
884
891
885
892
@property
886
893
def column_schema (self ):
887
894
return [(self .text_column , str ), (self .slot_column , List [Slot ])]
888
895
889
def initialize(self, from_scratch=True):
    """Look through the dataset for all labels and create a vocab map for them.

    Generator-based initializer: the caller primes it with ``send(None)``,
    then sends one row dict at a time; on ``close()`` (GeneratorExit) the
    accumulated builder is frozen into ``self.vocab``.

    Args:
        from_scratch: when True and ``self.vocab`` already exists, skip
            initialization entirely (vocab was already built/provided).
            When False, keep extending the existing ``self.vocab_builder``
            so training can resume incrementally.
    """
    if self.vocab and from_scratch:
        # Vocab already built and we were asked for a fresh build: nothing to do.
        return
    try:
        while True:
            row = yield
            slots = row[self.slot_column]
            # NOTE(review): assumes each slot has a `.label` attribute — holds
            # for the project's Slot type per the column_schema above.
            self.vocab_builder.add_all(s.label for s in slots)
    except GeneratorExit:
        # Stream exhausted: finalize the vocabulary.
        self.vocab = self.vocab_builder.make_vocab()
902
907
903
908
def numberize (self , row ):
904
909
"""
@@ -993,23 +998,26 @@ def __init__(
993
998
self .text_column = text_column
994
999
self .dict_column = dict_column
995
1000
self .tokenizer = tokenizer or Tokenizer ()
1001
+ self .vocab_builder = VocabBuilder ()
1002
+ self .vocab = None
996
1003
997
1004
@property
998
1005
def column_schema (self ):
999
1006
return [(self .text_column , str ), (self .dict_column , Gazetteer )]
1000
1007
1001
def initialize(self, from_scratch=True):
    """Look through the dataset for all dict features to create vocab.

    Generator-based initializer: prime with ``send(None)``, send row dicts,
    and on ``close()`` (GeneratorExit) the accumulated builder is frozen
    into ``self.vocab``.

    Args:
        from_scratch: when True and ``self.vocab`` already exists, skip
            initialization (vocab was already built/provided). When False,
            keep extending the existing ``self.vocab_builder`` so training
            can resume incrementally.
    """
    if self.vocab and from_scratch:
        # Vocab already built and we were asked for a fresh build: nothing to do.
        return
    try:
        while True:
            row = yield
            for token_dict in row[self.dict_column]:
                # NOTE(review): assumes each gazetteer entry is a dict with a
                # "features" list — matches the Gazetteer column_schema above.
                self.vocab_builder.add_all(token_dict["features"])
    except GeneratorExit:
        # Stream exhausted: finalize the vocabulary.
        self.vocab = self.vocab_builder.make_vocab()
1013
1021
1014
1022
def numberize (self , row ):
1015
1023
"""
@@ -1169,6 +1177,7 @@ def __init__(
1169
1177
self .column = column
1170
1178
self .tokenizer = tokenizer or Tokenizer ()
1171
1179
self .vocab = vocab
1180
+ self .vocab_builder = None
1172
1181
self .add_bos_token = add_bos_token
1173
1182
self .add_eos_token = add_eos_token
1174
1183
self .use_eos_token_for_bos = use_eos_token_for_bos
@@ -1181,24 +1190,25 @@ def __init__(
1181
1190
def column_schema (self ):
1182
1191
return [(self .column , List [str ])]
1183
1192
1184
- def initialize (self , vocab_builder = None ):
1193
+ def initialize (self , vocab_builder = None , from_scratch = True ):
1185
1194
"""Build vocabulary based on training corpus."""
1186
- if self .vocab :
1195
+ if self .vocab and from_scratch :
1187
1196
return
1188
- vocab_builder = vocab_builder or VocabBuilder ()
1189
- vocab_builder .use_bos = self .add_bos_token
1190
- vocab_builder .use_eos = self .add_eos_token
1191
- vocab_builder .use_bol = self .add_bol_token
1192
- vocab_builder .use_eol = self .add_eol_token
1197
+ if not self .vocab_builder :
1198
+ self .vocab_builder = vocab_builder or VocabBuilder ()
1199
+ self .vocab_builder .use_bos = self .add_bos_token
1200
+ self .vocab_builder .use_eos = self .add_eos_token
1201
+ self .vocab_builder .use_bol = self .add_bol_token
1202
+ self .vocab_builder .use_eol = self .add_eol_token
1193
1203
1194
1204
try :
1195
1205
while True :
1196
1206
row = yield
1197
1207
for raw_text in row [self .column ]:
1198
1208
tokenized = self .tokenizer .tokenize (raw_text )
1199
- vocab_builder .add_all ([t .value for t in tokenized ])
1209
+ self . vocab_builder .add_all ([t .value for t in tokenized ])
1200
1210
except GeneratorExit :
1201
- self .vocab = vocab_builder .make_vocab ()
1211
+ self .vocab = self . vocab_builder .make_vocab ()
1202
1212
1203
1213
_lookup_tokens = TokenTensorizer ._lookup_tokens
1204
1214
_tokenize = TokenTensorizer ._tokenize
@@ -1274,27 +1284,29 @@ def from_config(cls, config: Config):
1274
1284
def __init__ (self , column : str = Config .column , vocab = None ):
1275
1285
self .column = column
1276
1286
self .vocab = vocab
1287
+ self .vocab_builder = None
1277
1288
1278
1289
@property
1279
1290
def column_schema (self ):
1280
1291
return [(self .column , List [str ])]
1281
1292
1282
- def initialize (self , vocab_builder = None ):
1293
+ def initialize (self , vocab_builder = None , from_scratch = True ):
1283
1294
"""Build vocabulary based on training corpus."""
1284
- if self .vocab :
1295
+ if self .vocab and from_scratch :
1285
1296
return
1286
- vocab_builder = vocab_builder or VocabBuilder ()
1287
- vocab_builder .use_unk = False
1288
- vocab_builder .use_pad = False
1297
+ if not self .vocab_builder :
1298
+ self .vocab_builder = vocab_builder or VocabBuilder ()
1299
+ self .vocab_builder .use_unk = False
1300
+ self .vocab_builder .use_pad = False
1289
1301
1290
1302
try :
1291
1303
while True :
1292
1304
row = yield
1293
1305
annotation = Annotation (row [self .column ])
1294
1306
actions = annotation .tree .to_actions ()
1295
- vocab_builder .add_all (actions )
1307
+ self . vocab_builder .add_all (actions )
1296
1308
except GeneratorExit :
1297
- self .vocab = vocab_builder .make_vocab ()
1309
+ self .vocab = self . vocab_builder .make_vocab ()
1298
1310
self .shift_idx = self .vocab .idx [SHIFT ]
1299
1311
self .reduce_idx = self .vocab .idx [REDUCE ]
1300
1312
@@ -1378,11 +1390,16 @@ def tensorize(self, batch):
1378
1390
return cuda .tensor (batch , torch .float )
1379
1391
1380
1392
1381
- def initialize_tensorizers (tensorizers , data_source ):
1393
+ def initialize_tensorizers (tensorizers , data_source , from_scratch = True ):
1382
1394
"""A utility function to stream a data source to the initialize functions
1383
1395
of a dict of tensorizers."""
1384
1396
initializers = []
1385
- for init in [tensorizer .initialize () for tensorizer in tensorizers .values ()]:
1397
+ for init in [
1398
+ tensorizer .initialize (from_scratch = from_scratch )
1399
+ if hasattr (tensorizer , "vocab" )
1400
+ else tensorizer .initialize ()
1401
+ for tensorizer in tensorizers .values ()
1402
+ ]:
1386
1403
try :
1387
1404
init .send (None ) # kick
1388
1405
initializers .append (init )
0 commit comments