@@ -57,17 +57,25 @@ class RoBERTaEncoder(RoBERTaEncoderBase):
     """A PyTorch RoBERTa implementation"""
 
     class Config(RoBERTaEncoderBase.Config):
+        embedding_dim: int = 768
+        vocab_size: int = 50265
         num_encoder_layers: int = 12
         num_attention_heads: int = 12
         model_path: str = (
             "manifold://pytext_training/tree/static/models/roberta_base_torch.pt"
         )
+        # Loading the state dict of the model depends on whether the model was
+        # previously finetuned in PyText or not. If it was finetuned then we
+        # don't need to translate the state dict and can just load it
+        # directly.
+        is_finetuned: bool = False
 
     def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
         super().__init__(config, output_encoded_layers=output_encoded_layers)
         # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
         self.encoder = SentenceEncoder(
             transformer=Transformer(
+                vocab_size=config.vocab_size,
                 embedding_dim=config.embedding_dim,
                 layers=[
                     TransformerLayer(
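
The two new Config fields feed straight into the Transformer constructed here: `vocab_size` sets the size of the token embedding table and `embedding_dim` its width, which is also what the second hunk reads back as `representation_dim`. A self-contained sketch of that shape relationship (plain `nn.Embedding` only; the real `Transformer` also takes a padding index and the layer stack shown above):

```python
from torch import nn

# Sketch: what the new Config fields control. The token embedding table is
# (vocab_size x embedding_dim), and representation_dim is recovered from the
# last dimension of its weight, i.e. it equals embedding_dim.
vocab_size, embedding_dim = 50265, 768  # defaults added in this diff
token_embedding = nn.Embedding(vocab_size, embedding_dim)

representation_dim = token_embedding.weight.size(-1)
assert representation_dim == embedding_dim
assert token_embedding.weight.shape == (vocab_size, embedding_dim)
```
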
@@ -84,7 +92,13 @@ def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None
             config.model_path,
             map_location=lambda s, l: default_restore_location(s, "cpu"),
         )
-        self.encoder.load_roberta_state_dict(roberta_state["model"])
+        # In case the model has previously been loaded in PyText and finetuned,
+        # then we don't need to do the special state dict translation. Load
+        # it directly.
+        if not config.is_finetuned:
+            self.encoder.load_roberta_state_dict(roberta_state["model"])
+        else:
+            self.load_state_dict(roberta_state)
         self.representation_dim = self.encoder.transformer.token_embedding.weight.size(
             -1
         )
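
For context, the branch above distinguishes two checkpoint layouts: a pretrained fairseq-style RoBERTa checkpoint keeps its tensors under `roberta_state["model"]` with key names that `load_roberta_state_dict` translates into this module's layout, while a checkpoint saved after finetuning in PyText already matches and is loaded directly with `load_state_dict`. A toy, self-contained sketch of that pattern (module and key names are illustrative, not PyText's actual ones):

```python
import torch
from torch import nn


class TinyEncoder(nn.Module):
    """Toy stand-in for the encoder; only the loading pattern matters here."""

    def __init__(self) -> None:
        super().__init__()
        self.token_embedding = nn.Embedding(100, 8)

    def load_translated_state_dict(self, foreign_state):
        # Analogue of load_roberta_state_dict: remap the checkpoint's key
        # names onto this module's attribute names before loading.
        translated = {
            key.replace("decoder.embed_tokens", "token_embedding"): value
            for key, value in foreign_state.items()
        }
        self.load_state_dict(translated)


encoder = TinyEncoder()

# is_finetuned=False analogue: keys use the original layout, translate first.
foreign = {"decoder.embed_tokens.weight": torch.zeros(100, 8)}
encoder.load_translated_state_dict(foreign)

# is_finetuned=True analogue: keys already match this module, load directly.
encoder.load_state_dict(encoder.state_dict())
```
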