Back out "[PyText] remove SeqNNTask_Deprecated" (#947)

hudeven · facebook-github-bot · commit 8aded258608e · 2019-09-03T20:31:07.000-07:00
Summary: Pull Request resolved: #947 Original commit changeset: 04ec5a6458a8 Reviewed By: liaimi Differential Revision: D17176512 fbshipit-source-id: cd2c0ccd6eee6b245eb5725ee65f608ecd65e49f
diff --git a/pytext/builtin_task.py b/pytext/builtin_task.py
@@ -27,6 +27,7 @@
     SemanticParsingTask,
     SemanticParsingTask_Deprecated,
     SeqNNTask,
+    SeqNNTask_Deprecated,
     SquadQATask,
     WordTaggingTask,
     WordTaggingTask_Deprecated,
@@ -83,6 +84,7 @@ def register_builtin_tasks():
             SemanticParsingTask,
             SemanticParsingTask_Deprecated,
             SeqNNTask,
+            SeqNNTask_Deprecated,
             SquadQATask,
             WordTaggingTask,
             WordTaggingTask_Deprecated,
diff --git a/pytext/config/test/json_config/v6.json b/pytext/config/test/json_config/v6.json
@@ -102,5 +102,31 @@
       },
       "version": 6
     }
+  },
+  {
+    "original": {
+      "task": {
+        "SeqNNTask": {
+          "data_handler": {
+            "train_path": "tests/data/train_data_tiny.tsv",
+            "eval_path": "tests/data/test_data_tiny.tsv",
+            "test_path": "tests/data/test_data_tiny.tsv"
+          }
+        }
+      },
+      "version": 5
+    },
+    "adapted": {
+      "task": {
+        "SeqNNTask_Deprecated": {
+          "data_handler": {
+            "train_path": "tests/data/train_data_tiny.tsv",
+            "eval_path": "tests/data/test_data_tiny.tsv",
+            "test_path": "tests/data/test_data_tiny.tsv"
+          }
+        }
+      },
+      "version": 6
+    }
   }
 ]
diff --git a/pytext/data/__init__.py b/pytext/data/__init__.py
@@ -15,6 +15,7 @@
 from .disjoint_multitask_data_handler import DisjointMultitaskDataHandler
 from .doc_classification_data_handler import DocClassificationDataHandler, RawData
 from .joint_data_handler import JointModelDataHandler
+from .seq_data_handler import SeqModelDataHandler
 from .tensorizers import Tensorizer
 
 
@@ -37,5 +38,6 @@
     "RandomizedBatchSampler",
     "RawData",
     "RoundRobinBatchSampler",
+    "SeqModelDataHandler",
     "Tensorizer",
 ]
diff --git a/pytext/data/seq_data_handler.py b/pytext/data/seq_data_handler.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Any, Dict, List
+
+from pytext.common.constants import DatasetFieldName, DFColumn
+from pytext.config import ConfigBase
+from pytext.config.doc_classification import ModelInput
+from pytext.config.field_config import DocLabelConfig, FeatureConfig
+from pytext.data.featurizer import InputRecord
+from pytext.fields import (
+    DocLabelField,
+    Field,
+    FloatVectorField,
+    RawField,
+    SeqFeatureField,
+)
+from pytext.utils import data
+
+from .joint_data_handler import JointModelDataHandler
+
+
+SEQ_LENS = "seq_lens"
+
+
+class SeqModelDataHandler(JointModelDataHandler):
+    class Config(JointModelDataHandler.Config):
+        columns_to_read: List[str] = [DFColumn.DOC_LABEL, DFColumn.UTTERANCE]
+        pretrained_embeds_file: str = ""
+
+    FULL_FEATURES = [DatasetFieldName.TEXT_FIELD]
+
+    @classmethod
+    def from_config(
+        cls,
+        config: Config,
+        feature_config: FeatureConfig,
+        label_config: DocLabelConfig,
+        **kwargs
+    ):
+        word_feat_config = feature_config.word_feat
+        dense_feat_config = feature_config.dense_feat
+        features: Dict[str, Field] = {
+            ModelInput.WORD_FEAT: SeqFeatureField(
+                pretrained_embeddings_path=word_feat_config.pretrained_embeddings_path,
+                embed_dim=word_feat_config.embed_dim,
+                embedding_init_strategy=word_feat_config.embedding_init_strategy,
+                vocab_file=word_feat_config.vocab_file,
+                vocab_size=word_feat_config.vocab_size,
+                vocab_from_train_data=word_feat_config.vocab_from_train_data,
+            )
+        }
+        if dense_feat_config:
+            features[ModelInput.DENSE_FEAT] = FloatVectorField(
+                dim=dense_feat_config.dim,
+                dim_error_check=dense_feat_config.dim_error_check,
+            )
+
+        labels: Dict[str, Field] = {DocLabelConfig._name: DocLabelField()}
+        extra_fields: Dict[str, Field] = {DatasetFieldName.UTTERANCE_FIELD: RawField()}
+
+        return cls(
+            raw_columns=config.columns_to_read,
+            labels=labels,
+            features=features,
+            extra_fields=extra_fields,
+            shuffle=config.shuffle,
+            train_path=config.train_path,
+            eval_path=config.eval_path,
+            test_path=config.test_path,
+            train_batch_size=config.train_batch_size,
+            eval_batch_size=config.eval_batch_size,
+            test_batch_size=config.test_batch_size,
+            **kwargs
+        )
+
+    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
+        sequence = data.parse_json_array(row_data[DFColumn.UTTERANCE])
+
+        features_list = [
+            self.featurizer.featurize(InputRecord(raw_text=utterance))
+            for utterance in sequence
+        ]
+        res = {
+            # features
+            ModelInput.WORD_FEAT: [utterance.tokens for utterance in features_list],
+            # labels
+            DatasetFieldName.DOC_LABEL_FIELD: row_data[DFColumn.DOC_LABEL],
+            DatasetFieldName.UTTERANCE_FIELD: row_data[DFColumn.UTTERANCE],
+        }
+        if DFColumn.DENSE_FEAT in row_data:
+            res[ModelInput.DENSE_FEAT] = row_data.get(DFColumn.DENSE_FEAT)
+        return res
diff --git a/pytext/data/test/seq_data_handler_test.py b/pytext/data/test/seq_data_handler_test.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import unittest
+
+import numpy as np
+from pytext.common.constants import DFColumn
+from pytext.config.field_config import DocLabelConfig, FeatureConfig
+from pytext.data import SeqModelDataHandler
+from pytext.data.featurizer import SimpleFeaturizer
+
+
+class SeqModelDataHandlerTest(unittest.TestCase):
+    def setUp(self):
+        self.train_data = [
+            {
+                DFColumn.DOC_LABEL: "cu:discuss_where",
+                DFColumn.UTTERANCE: '["where do you wanna meet?", "MPK"]',
+            }
+        ]
+
+        self.eval_data = [
+            {
+                DFColumn.DOC_LABEL: "cu:discuss_where",
+                DFColumn.UTTERANCE: '["how about SF?", "sounds good"]',
+            },
+            {DFColumn.DOC_LABEL: "cu:other", DFColumn.UTTERANCE: '["lol"]'},
+        ]
+
+        self.test_data = [
+            {
+                DFColumn.DOC_LABEL: "cu:discuss_where",
+                DFColumn.UTTERANCE: '["MPK sounds good to me"]',
+            },
+            {
+                DFColumn.DOC_LABEL: "cu:other",
+                DFColumn.UTTERANCE: '["great", "awesome"]',
+            },
+        ]
+
+        self.dh = SeqModelDataHandler.from_config(
+            SeqModelDataHandler.Config(),
+            FeatureConfig(),
+            DocLabelConfig(),
+            featurizer=SimpleFeaturizer.from_config(
+                SimpleFeaturizer.Config(), FeatureConfig()
+            ),
+        )
+
+    def test_intermediate_result(self):
+        data = self.dh.gen_dataset(self.train_data)
+        self.assertListEqual(
+            data.examples[0].word_feat,
+            [["where", "do", "you", "wanna", "meet?"], ["mpk"]],
+        )
+
+    def test_process_data(self):
+        self.dh.init_metadata_from_raw_data(
+            self.train_data, self.eval_data, self.test_data
+        )
+        train_iter = self.dh.get_train_iter_from_raw_data(self.train_data, 1)
+        for input, target, _ in train_iter:
+            np.testing.assert_array_almost_equal(
+                input[0][0].numpy(), [[6, 2, 7, 5, 3], [4, 1, 1, 1, 1]]
+            )
+            np.testing.assert_array_almost_equal(input[1].numpy(), [2])
+            np.testing.assert_array_almost_equal(target[0].numpy(), [0])
diff --git a/pytext/exporters/test/text_model_exporter_test.py b/pytext/exporters/test/text_model_exporter_test.py
@@ -16,6 +16,7 @@
 from pytext.builtin_task import (
     DocClassificationTask_Deprecated,
     IntentSlotTask,
+    SeqNNTask_Deprecated,
     WordTaggingTask_Deprecated,
 )
 from pytext.common.constants import DatasetFieldName
@@ -663,8 +664,7 @@ def test_joint_export_to_caffe2(
         test_num_chars=st.integers(1, 7),
         test_num_seq=st.integers(1, 7),
     )
-    # TODO: migrate to SeqNNTask T48066205
-    def DISABLED_test_seq_nn_export_to_caffe2(
+    def test_seq_nn_export_to_caffe2(
         self,
         export_num_words,
         export_num_dict_feat,
@@ -676,8 +676,6 @@ def DISABLED_test_seq_nn_export_to_caffe2(
         test_num_chars,
         test_num_seq,
     ):
-        # SeqNNTask_Deprecated is removed, set to None to fix lint
-        SeqNNTask_Deprecated = None
         config = self._get_config(SeqNNTask_Deprecated.Config, SEQ_NN_CONFIG)
         metadata = self._get_seq_metadata(num_doc_classes, 0)
         py_model = create_model(config.model, config.features, metadata)
diff --git a/pytext/models/seq_models/seqnn.py b/pytext/models/seq_models/seqnn.py
@@ -10,6 +10,22 @@
 from pytext.models.representations.seq_rep import SeqRepresentation
 
 
+class SeqNNModel_Deprecated(Model):
+    """
+    Classification model that takes a sequence of utterances as input.
+
+    It uses a docnn model (CNN or LSTM) to generate a vector representation
+    for each sequence, and then uses an LSTM or BLSTM to capture the dynamics
+    and produce labels for each sequence.
+
+    DEPRECATED: Use SeqNNModel
+    """
+
+    # Sub-component configs; each field defaults to that component's own
+    # default Config.
+    class Config(ConfigBase):
+        representation: SeqRepresentation.Config = SeqRepresentation.Config()
+        output_layer: ClassificationOutputLayer.Config = ClassificationOutputLayer.Config()
+        decoder: MLPDecoder.Config = MLPDecoder.Config()
+
+
 class SeqNNModel(DocModel):
     """
     Classification model with sequence of utterances as input.
diff --git a/pytext/task/tasks.py b/pytext/task/tasks.py
@@ -10,6 +10,7 @@
     CompositionalDataHandler,
     DocClassificationDataHandler,
     JointModelDataHandler,
+    SeqModelDataHandler,
 )
 from pytext.data.bert_tensorizer import BERTTensorizer
 from pytext.data.data import Data
@@ -54,7 +55,7 @@
     RNNGParser_Deprecated,
 )
 from pytext.models.seq_models.contextual_intent_slot import ContextualIntentSlotModel
-from pytext.models.seq_models.seqnn import SeqNNModel
+from pytext.models.seq_models.seqnn import SeqNNModel, SeqNNModel_Deprecated
 from pytext.models.word_model import WordTaggingModel, WordTaggingModel_Deprecated
 from pytext.task import Task_Deprecated
 from pytext.task.new_task import NewTask
@@ -251,6 +252,18 @@ class Config(NewTask.Config):
         )
 
 
+class SeqNNTask_Deprecated(Task_Deprecated):
+    """
+    Task definition that pairs SeqNNModel_Deprecated with SeqModelDataHandler.
+
+    The class body only declares the task's Config.
+    """
+
+    class Config(Task_Deprecated.Config):
+        model: SeqNNModel_Deprecated.Config = SeqNNModel_Deprecated.Config()
+        trainer: Trainer.Config = Trainer.Config()
+        labels: DocLabelConfig = DocLabelConfig()
+        data_handler: SeqModelDataHandler.Config = SeqModelDataHandler.Config()
+        metric_reporter: ClassificationMetricReporter.Config = (
+            ClassificationMetricReporter.Config()
+        )
+        # No exporter is configured by default.
+        exporter: Optional[DenseFeatureExporter.Config] = None
+
+
 class SeqNNTask(NewTask):
     class Config(NewTask.Config):
         model: SeqNNModel.Config = SeqNNModel.Config()