
Commit 01f0fd0

Fixes for LayoutLM (#7318)
1 parent 702a76f commit 01f0fd0

3 files changed: +31 -41 lines changed

src/transformers/configuration_layoutlm.py (+17 -17)
@@ -40,40 +40,40 @@ class LayoutLMConfig(BertConfig):

    Args:
-        vocab_size (:obj:`int`, optional, defaults to 30522):
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the LayoutLM model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`.
-        hidden_size (:obj:`int`, optional, defaults to 768):
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, optional, defaults to 12):
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, optional, defaults to 3072):
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
-            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, optional, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-        initializer_range (:obj:`float`, optional, defaults to 0.02):
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
-        gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
-        max_2d_position_embeddings (:obj:`int`, optional, defaults to 1024):
+        max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
            The maximum value that the 2D position embedding might ever used.
            Typically set this to something large just in case (e.g., 1024).

-    Example::
+    Examples::

        >>> from transformers import LayoutLMModel, LayoutLMConfig
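For readers who want to see the documented parameters in use, here is a minimal sketch in the spirit of the `Examples::` block being renamed above. The example in the file is truncated in this view, so the snippet below is an illustration rather than the committed text; the values simply mirror the documented defaults.

# Minimal sketch (not part of this commit) exercising the parameters documented above.
from transformers import LayoutLMConfig, LayoutLMModel

# A configuration with the documented defaults, plus the LayoutLM-specific
# 2D position embedding size.
configuration = LayoutLMConfig(
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=512,
    max_2d_position_embeddings=1024,
)

# A randomly initialized model built from that configuration.
model = LayoutLMModel(configuration)

# The configuration can be read back from the model.
configuration = model.config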

src/transformers/modeling_layoutlm.py (+12 -22)
@@ -118,6 +118,7 @@ def forward(
         return embeddings


+# Copied from transformers.modeling_bert.BertSelfAttention with Bert->LayoutLM
 class LayoutLMSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -172,6 +173,7 @@ def forward(
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in LayoutLMModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
@@ -195,6 +197,7 @@ def forward(
         return outputs


+# Copied from transformers.modeling_bert.BertSelfOutput with Bert->LayoutLM
 class LayoutLMSelfOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -209,6 +212,7 @@ def forward(self, hidden_states, input_tensor):
         return hidden_states


+# Copied from transformers.modeling_bert.BertAttention with Bert->LayoutLM
 class LayoutLMAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -256,6 +260,7 @@ def forward(
         return outputs


+# Copied from transformers.modeling_bert.BertIntermediate
 class LayoutLMIntermediate(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -271,6 +276,7 @@ def forward(self, hidden_states):
         return hidden_states


+# Copied from transformers.modeling_bert.BertOutput with Bert->LayoutLM
 class LayoutLMOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -285,6 +291,7 @@ def forward(self, hidden_states, input_tensor):
         return hidden_states


+# Copied from transformers.modeling_bert.BertLayer with Bert->LayoutLM
 class LayoutLMLayer(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -344,6 +351,7 @@ def feed_forward_chunk(self, attention_output):
         return layer_output


+# Copied from transformers.modeling_bert.BertEncoder with Bert->LayoutLM
 class LayoutLMEncoder(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -408,6 +416,7 @@ def custom_forward(*inputs):
         )


+# Copied from transformers.modeling_bert.BertPooler
 class LayoutLMPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -423,6 +432,7 @@ def forward(self, hidden_states):
         return pooled_output


+# Copied from transformers.modeling_bert.BertPredictionHeadTransform with Bert->LayoutLM
 class LayoutLMPredictionHeadTransform(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -440,6 +450,7 @@ def forward(self, hidden_states):
         return hidden_states


+# Copied from transformers.modeling_bert.BertLMPredictionHead with Bert->LayoutLM
 class LayoutLMLMPredictionHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -460,6 +471,7 @@ def forward(self, hidden_states):
         return hidden_states


+# Copied from transformers.modeling_bert.BertOnlyMLMHead with Bert->LayoutLM
 class LayoutLMOnlyMLMHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -470,28 +482,6 @@ def forward(self, sequence_output):
         return prediction_scores


-class LayoutLMOnlyNSPHead(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
-
-
-class LayoutLMPreTrainingHeads(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.predictions = LayoutLMLMPredictionHead(config)
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, sequence_output, pooled_output):
-        prediction_scores = self.predictions(sequence_output)
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return prediction_scores, seq_relationship_score
-
-
 class LayoutLMPreTrainedModel(PreTrainedModel):
     """An abstract class to handle weights initialization and
     a simple interface for downloading and loading pretrained models.
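The comment added in the second hunk notes that the attention mask is "precomputed for all layers in LayoutLMModel forward() function": the per-layer self-attention modules only ever add it to the raw attention scores before the softmax. The standalone sketch below illustrates that additive-mask convention under the usual assumption of a large negative fill value for padded positions; it is an illustration of the idea, not the helper the library itself uses.

# Standalone sketch of the additive attention-mask convention referenced above
# (illustrative only; not the library's internal utility).
import torch

def extend_attention_mask(attention_mask: torch.Tensor, dtype=torch.float32) -> torch.Tensor:
    # attention_mask: (batch, seq_len) with 1 for real tokens and 0 for padding.
    # Broadcast to (batch, 1, 1, seq_len) so it applies to every head and query position.
    extended = attention_mask[:, None, None, :].to(dtype)
    # Real tokens add 0; padded positions add a large negative number, so the
    # softmax assigns them (near) zero attention probability.
    return (1.0 - extended) * -10000.0

mask = torch.tensor([[1, 1, 1, 0, 0]])            # one sequence, two padding tokens
additive = extend_attention_mask(mask)            # shape (1, 1, 1, 5)
scores = torch.randn(1, 12, 5, 5)                 # (batch, heads, query, key) raw scores
probs = torch.softmax(scores + additive, dim=-1)  # padded keys receive ~0 weight

Because the mask is already in this additive form when it reaches each layer, every copied self-attention module can simply do `attention_scores = attention_scores + attention_mask`, exactly as in the context lines above.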

src/transformers/modeling_roberta.py (+2 -2)
@@ -142,7 +142,7 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
         return position_ids.unsqueeze(0).expand(input_shape)


-# Copied from transformers.modeling_bert.BertSelfAttention
+# Copied from transformers.modeling_bert.BertSelfAttention with Bert->Roberta
 class RobertaSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -197,7 +197,7 @@ def forward(
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
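Both files lean on the repository's "# Copied from X with A->B" markers: the idea is that a copied class can be regenerated from its source by applying the textual rename and compared against what is checked in. The toy sketch below illustrates that idea under stated assumptions; the function names are hypothetical and the repository's real consistency check is a separate, more involved script.

# Toy sketch of checking a "Copied from X with Bert->Roberta" marker
# (hypothetical helpers; not the repository's actual check script).
import re

def apply_replacement(source_code: str, pattern: str) -> str:
    # pattern looks like "Bert->Roberta": apply the rename throughout the source.
    old, new = pattern.split("->")
    return re.sub(old, new, source_code)

def is_consistent(copied_code: str, original_code: str, pattern: str) -> bool:
    # The copy is consistent if it equals the original after the rename.
    return copied_code.strip() == apply_replacement(original_code, pattern).strip()

original = "class BertSelfOutput:\n    name = 'Bert'\n"
copied = "class RobertaSelfOutput:\n    name = 'Roberta'\n"
print(is_consistent(copied, original, "Bert->Roberta"))  # True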
