diff --git a/docs/code/modules.rst b/docs/code/modules.rst index a1a8b1392..eada424b4 100644 --- a/docs/code/modules.rst +++ b/docs/code/modules.rst @@ -64,6 +64,11 @@ Encoders .. autoclass:: texar.torch.modules.BERTEncoder :members: +:hidden:`ELMoEncoder` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.torch.modules.ELMoEncoder + :members: + :hidden:`RoBERTaEncoder` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: texar.torch.modules.RoBERTaEncoder @@ -283,6 +288,11 @@ Pre-trained .. autoclass:: texar.torch.modules.PretrainedBERTMixin :members: +:hidden:`PretrainedELMoMixin` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.torch.modules.PretrainedELMoMixin + :members: + :hidden:`PretrainedRoBERTaMixin` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: texar.torch.modules.PretrainedRoBERTaMixin diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 38a5a84af..3b2871bd2 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -72,3 +72,4 @@ tokenizer wordpiece unigram TF +convnet diff --git a/requirements.txt b/requirements.txt index efdba2f84..22f7f8cdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ numpy >= 1.15.4 mypy_extensions >= 0.4.1 regex >= 2018.01.10 sentencepiece >= 0.1.8 +h5py >= 2.10.0 diff --git a/setup.py b/setup.py index 85fef9cb4..3b86d933c 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ install_requires=[ 'regex>=2018.01.10', 'numpy', + 'h5py>=2.10.0', 'requests', 'funcsigs', 'sentencepiece>=0.1.8', diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py new file mode 100644 index 000000000..9f51168a0 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py @@ -0,0 +1,131 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of pre-trained ELMo tokenizer. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/elmo_indexer.py` +""" +from typing import Dict, List, Optional + +import torch + +from torch.nn.utils.rnn import pad_sequence + + +__all__ = [ + "ELMoCharacterMapper", + "batch_to_ids", +] + + +def _make_bos_eos(character: int, + padding_character: int, + beginning_of_word_character: int, + end_of_word_character: int, + max_word_length: int): + char_ids = [padding_character] * max_word_length + char_ids[0] = beginning_of_word_character + char_ids[1] = character + char_ids[2] = end_of_word_character + return char_ids + + +class ELMoCharacterMapper: + r"""Maps individual tokens to sequences of character ids, compatible with + ELMo. To be consistent with previously trained models, we include it here as + special of existing character indexers. + + We allow to add optional additional special tokens with designated + character ids with `tokens_to_add`. 
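For orientation, here is a standalone sketch (not part of the change set; the helper name is made up) of the per-word layout that `convert_word_to_char_ids` below produces for an ordinary word: a begin-of-word marker, the word's UTF-8 bytes, an end-of-word marker, padding out to 50 positions, and a final +1 shift so that id 0 stays reserved for masking. The constants mirror the class attributes defined below.

```python
MAX_WORD_LENGTH = 50
BEGINNING_OF_WORD = 258
END_OF_WORD = 259
PADDING = 260


def word_to_char_ids_sketch(word: str) -> list:
    # Reserve two slots for the begin-of-word / end-of-word markers.
    encoded = word.encode("utf-8", "ignore")[:MAX_WORD_LENGTH - 2]
    char_ids = [PADDING] * MAX_WORD_LENGTH
    char_ids[0] = BEGINNING_OF_WORD
    for k, byte in enumerate(encoded, start=1):
        char_ids[k] = byte
    char_ids[len(encoded) + 1] = END_OF_WORD
    # Shift by +1 so that id 0 remains free for padding/masking.
    return [c + 1 for c in char_ids]


print(word_to_char_ids_sketch("cat")[:6])  # [259, 100, 98, 117, 260, 261]
```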
+ """ + + max_word_length = 50 + + # char ids 0-255 come from utf-8 encoding bytes + # assign 256-300 to special chars + beginning_of_sentence_character = 256 # + end_of_sentence_character = 257 # + beginning_of_word_character = 258 # + end_of_word_character = 259 # + padding_character = 260 # + + beginning_of_sentence_characters = _make_bos_eos( + beginning_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + end_of_sentence_characters = _make_bos_eos( + end_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + + bos_token = "" + eos_token = "" + + def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None: + self.tokens_to_add = tokens_to_add or {} + + def convert_word_to_char_ids(self, word: str) -> List[int]: + if word in self.tokens_to_add: + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character + char_ids[1] = self.tokens_to_add[word] + char_ids[2] = self.end_of_word_character + elif word == self.bos_token: + char_ids = self.beginning_of_sentence_characters + elif word == self.eos_token: + char_ids = self.end_of_sentence_characters + else: + word_encoded = word.encode("utf-8", "ignore")[: ( + self.max_word_length - 2)] + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character + for k, chr_id in enumerate(word_encoded, start=1): + char_ids[k] = chr_id + char_ids[len(word_encoded) + 1] = self.end_of_word_character + + # +1 one for masking + return [c + 1 for c in char_ids] + + def __eq__(self, other) -> bool: + if isinstance(self, other.__class__): + return self.__dict__ == other.__dict__ + return NotImplemented + + +def batch_to_ids(batch: List[List[str]]) -> torch.Tensor: + r"""Converts a batch of tokenized sentences to a tensor representing the + sentences with encoded characters (len(batch), max sentence length, + max word length). + + Args: + batch: A list of tokenized sentences. + + Returns: + A tensor of padded character ids. + """ + res = [] + mapper = ELMoCharacterMapper() + for sentence in batch: + character_ids = [mapper.convert_word_to_char_ids(token) + for token in sentence] + res.append(torch.tensor(character_ids)) + + return pad_sequence(res, batch_first=True) diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py new file mode 100644 index 000000000..32e2c7a24 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py @@ -0,0 +1,72 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for the utils of pre-trained ELMo tokenizer. 
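A quick usage sketch for `batch_to_ids` as defined above, assuming the module path introduced by this diff is importable: it pads a ragged batch of tokenized sentences into a single `(batch_size, max_sentence_length, 50)` tensor of character ids, with all-zero rows at padded token positions.

```python
import torch

from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids

# Two tokenized sentences of different lengths.
sentences = [["First", "sentence", "."], ["Another", "."]]

character_ids = batch_to_ids(sentences)

# One row of 50 character ids per token; the shorter sentence is zero-padded.
print(character_ids.shape)        # torch.Size([2, 3, 50])
print(character_ids[1, 2].sum())  # tensor(0) -- padding row
assert character_ids.dtype == torch.int64
```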
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + ELMoCharacterMapper, batch_to_ids) + + +class ELMoTokenizerUtilsTest(unittest.TestCase): + + def test_bos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + # [, , , , ... ] + expected_indices = [259, 257, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_eos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + expected_indices = [259, 258, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_unicode_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids(chr(256) + "t") + expected_indices = [259, 197, 129, 117, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_additional_tokens(self): + mapper = ELMoCharacterMapper(tokens_to_add={"": 1}) + indices = mapper.convert_word_to_char_ids("") + expected_indices = [259, 2, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_batch_to_ids(self): + sentences = [['First', 'sentence', '.'], ['Another', '.']] + indices = batch_to_ids(sentences) + expected_indices = [[ + [259, 71, 106, 115, 116, 117, 260] + [261] * 43, + [259, 116, 102, 111, 117, 102, 111, 100, 102, 260] + [261] * 40, + [259, 47, 260] + [261] * 47], [ + [259, 66, 111, 112, 117, 105, 102, 115, 260] + [261] * 41, + [259, 47, 260] + [261] * 47, + [0] * 50]] + self.assertEqual(indices.tolist(), expected_indices) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/encoders/__init__.py b/texar/torch/modules/encoders/__init__.py index ce69fd985..1031dab5c 100644 --- a/texar/torch/modules/encoders/__init__.py +++ b/texar/torch/modules/encoders/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.encoders.bert_encoder import * from texar.torch.modules.encoders.conv_encoders import * +from texar.torch.modules.encoders.elmo_encoder import * from texar.torch.modules.encoders.encoder_base import * from texar.torch.modules.encoders.gpt2_encoder import * from texar.torch.modules.encoders.multihead_attention import * diff --git a/texar/torch/modules/encoders/elmo_encoder.py b/texar/torch/modules/encoders/elmo_encoder.py new file mode 100644 index 000000000..98c05f8a1 --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder.py @@ -0,0 +1,318 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ELMo encoder. 
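For context, a minimal usage sketch of the encoder defined in this file, mirroring the unit tests further down in the diff; it assumes the pre-trained `elmo-small` checkpoint can be downloaded, and 256 here is twice that model's 128-dimensional projection.

```python
from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids
from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder

# Loads the default "elmo-small" checkpoint (downloaded on first use).
encoder = ELMoEncoder(pretrained_model_name="elmo-small")

character_ids = batch_to_ids([["ELMo", "helps", "disambiguate", "."]])
outputs = encoder(character_ids)

# Two scalar-mixed representations by default (num_output_representations=2).
print(len(outputs["elmo_representations"]))      # 2
print(outputs["elmo_representations"][0].shape)  # torch.Size([1, 4, 256])
print(outputs["mask"].shape)                     # torch.Size([1, 4])
```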
+""" +import json +import os +import tempfile +import warnings + +from typing import Any, Dict, List, Optional, Union + +import torch + +from torch.nn.modules import Dropout + +from texar.torch.modules.encoders.encoder_base import EncoderBase +from texar.torch.modules.pretrained.elmo import PretrainedELMoMixin +from texar.torch.modules.pretrained.elmo_utils import ( + _ElmoBiLm, ScalarMix, remove_sentence_boundaries) + +__all__ = [ + "ELMoEncoder", +] + + +class ELMoEncoder(EncoderBase, PretrainedELMoMixin): + r"""ELMo model for encoding sequences. Please see + :class:`~texar.torch.modules.PretrainedELMoMixin` for a brief description + of ELMo. + + Args: + pretrained_model_name (optional): a `str`, the name + of pre-trained model (e.g., ``elmo-small``). Please refer to + :class:`~texar.torch.modules.PretrainedELMoMixin` for + all supported models. + If `None`, the model name in :attr:`hparams` is used. + cache_dir (optional): the path to a folder in which the + pre-trained models will be cached. If `None` (default), + a default directory (``texar_data`` folder under user's home + directory) will be used. + hparams (dict or HParams, optional): Hyperparameters. Missing + hyperparameter will be set to default values. See + :meth:`default_hparams` for the hyperparameter structure + and default values. + """ + def __init__(self, + pretrained_model_name: Optional[str] = None, + cache_dir: Optional[str] = None, + hparams=None): + super().__init__(hparams=hparams) + + self.load_pretrained_config(pretrained_model_name, cache_dir) + + options_file = None + weight_file = None + tmp_dir = tempfile.TemporaryDirectory() + if self.pretrained_model_dir is not None: + info = list(os.walk(self.pretrained_model_dir)) + root, _, files = info[0] + for file in files: + if file.endswith('options.json'): + options_file = os.path.join(root, file) + if file.endswith('weights.hdf5'): + weight_file = os.path.join(root, file) + else: + with open(os.path.join(tmp_dir.name, 'options.json'), "w") as fp: + json.dump(self.hparams.encoder.todict(), fp) + options_file = os.path.join(tmp_dir.name, 'options.json') + + assert options_file is not None + self._elmo_lstm = _ElmoBiLm( + options_file, weight_file, + requires_grad=self.hparams.requires_grad, + vocab_to_cache=self.hparams.vocab_to_cache) + tmp_dir.cleanup() + + self._has_cached_vocab = self.hparams.vocab_to_cache is not None + self._keep_sentence_boundaries = self.hparams.keep_sentence_boundaries + self._dropout = Dropout(p=self.hparams.dropout) + self._scalar_mixes: Any = [] + for k in range(self.hparams.num_output_representations): + scalar_mix = ScalarMix( + self._elmo_lstm.num_layers, + do_layer_norm=self.hparams.do_layer_norm, + initial_scalar_parameters=self.hparams.scalar_mix_parameters, + trainable=self.hparams.scalar_mix_parameters is None) + self.add_module("scalar_mix_{}".format(k), scalar_mix) + self._scalar_mixes.append(scalar_mix) + + @staticmethod + def default_hparams(): + r"""Returns a dictionary of hyperparameters with default values. + + * The encoder arch is determined by the constructor argument + :attr:`pretrained_model_name` if it's specified. In this case, + `hparams` are ignored. + * Otherwise, the encoder arch is determined by + `hparams['pretrained_model_name']` if it's specified. All other + configurations in `hparams` are ignored. + * If the above two are `None`, the encoder arch is defined by the + configurations in `hparams` and weights are randomly initialized. + + .. 
code-block:: python + + { + "pretrained_model_name": "elmo-small", + "encoder": { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], + [5, 256], [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + "num_output_representations": 2, + "requires_grad": False, + "do_layer_norm": False, + "dropout": 0.5, + "vocab_to_cache": None, + "keep_sentence_boundaries": False, + "scalar_mix_parameters": None, + "name": "elmo_encoder", + } + + Here: + + The default parameters are values for ELMo small model. + + `"pretrained_model_name"`: str or None + The name of the pre-trained ELMo model. If None, the model + will be randomly initialized. + + `"encoder"`: dict + Hyperparameters for ELMo encoder. + + `"num_output_representations"`: int + The number of ELMo representation to output with different linear + weighted combination of the 3 layers (i.e., character-convnet + output, the first LSTM output, the second LSTM output). + + `"requires_grad"`: bool + If True, compute gradient of ELMo parameters for fine tuning. + + `"do_layer_norm"`: bool + Should we apply layer normalization (passed to `ScalarMix`)? + + `"dropout"`: float + The dropout to be applied to the ELMo representations. + + `"vocab_to_cache"`: List[string] + A list of words to pre-compute and cache character convolutions + for. If you use this option, ELMo expects that you pass word + indices of shape `(batch_size, timesteps)` to forward, instead + of character indices. If you use this option and pass a word which + was not pre-cached, this will break. + + `"keep_sentence_boundaries"`: bool + If True, the representation of the sentence boundary tokens are + not removed. + + `"scalar_mix_parameters"`: List[float] + If not `None`, use these scalar mix parameters to weight the + representations produced by different layers. These mixing weights + are not updated during training. The mixing weights here should be + the unnormalized (i.e., pre-softmax) weights. So, if you wanted to + use only the 1st layer of a 2-layer ELMo, you can set this to + [-9e10, 1, -9e10 ]. + + `"name"`: str + Name of the module. + """ + return { + 'pretrained_model_name': 'elmo-small', + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + 'num_output_representations': 2, + 'requires_grad': False, + 'do_layer_norm': False, + 'dropout': 0.5, + 'vocab_to_cache': None, + 'keep_sentence_boundaries': False, + 'scalar_mix_parameters': None, + 'name': 'elmo_encoder', + '@no_typecheck': ['pretrained_model_name'] + } + + def forward(self, # type: ignore + inputs: torch.Tensor, + word_inputs: Optional[torch.Tensor] = None) -> \ + Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + Args: + inputs: Shape `[batch_size, max_time, 50]` of character ids + representing the current batch. + word_inputs: If you passed a cached vocab, you can in addition pass + a tensor of shape `[batch_size, max_time]`, which represent + word ids which have been pre-cached. 
+ + Returns: + A Dictionary with keys: + + - :attr:`elmo_representations`: A `num_output_representations` list + of ELMo representations for the input sequence. Each + representation is shape `[batch_size, max_time, embedding_dim]` + + - :attr:`mask`: Shape `(batch_size, timesteps)` long tensor + with sequence mask. + """ + # reshape the input if needed + original_shape = inputs.size() + if len(original_shape) > 3: + timesteps, num_characters = original_shape[-2:] + reshaped_inputs = inputs.view(-1, timesteps, num_characters) + else: + reshaped_inputs = inputs + + if word_inputs is not None: + original_word_size = word_inputs.size() + if self._has_cached_vocab and len(original_word_size) > 2: + reshaped_word_inputs = word_inputs.view(-1, + original_word_size[-1]) + elif not self._has_cached_vocab: + warnings.warn( + "Word inputs were passed to ELMo but it does not have a " + "cached vocab.") + reshaped_word_inputs = None # type: ignore + else: + reshaped_word_inputs = word_inputs + else: + reshaped_word_inputs = word_inputs # type: ignore + + # run the biLM + bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs) + layer_activations = bilm_output["activations"] + mask_with_bos_eos = bilm_output["mask"] + + # compute the elmo representations + representations = [] + for i in range(len(self._scalar_mixes)): + scalar_mix = getattr(self, "scalar_mix_{}".format(i)) + representation_with_bos_eos = scalar_mix(layer_activations, + mask_with_bos_eos) + if self._keep_sentence_boundaries: + processed_representation = representation_with_bos_eos + processed_mask = mask_with_bos_eos + else: + representation_without_bos_eos, mask_without_bos_eos = \ + remove_sentence_boundaries( + representation_with_bos_eos, mask_with_bos_eos) + processed_representation = representation_without_bos_eos + processed_mask = mask_without_bos_eos + representations.append(self._dropout(processed_representation)) + + # reshape if necessary + if word_inputs is not None and len(original_word_size) > 2: + mask = processed_mask.view(original_word_size) + elmo_representations = [ + representation.view(original_word_size + (-1,)) + for representation in representations] + elif len(original_shape) > 3: + mask = processed_mask.view(original_shape[:-1]) + elmo_representations = [ + representation.view(original_shape[:-1] + (-1,)) + for representation in representations] + else: + mask = processed_mask + elmo_representations = representations + + return {"elmo_representations": elmo_representations, "mask": mask} + + @property + def output_size(self): + return self._elmo_lstm.get_output_dim() diff --git a/texar/torch/modules/encoders/elmo_encoder_test.py b/texar/torch/modules/encoders/elmo_encoder_test.py new file mode 100644 index 000000000..04a34b359 --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder_test.py @@ -0,0 +1,146 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo Encoder. 
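The tests below cover the `requires_grad` hyperparameter; the following sketch summarizes the behaviour they check. It downloads the default `elmo-small` checkpoint and inspects the private `_elmo_lstm` attribute exactly as the test does; note that the scalar-mix weights stay trainable in both cases.

```python
from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder

# biLM weights are frozen by default; "requires_grad" enables fine-tuning.
frozen = ELMoEncoder()
tunable = ELMoEncoder(hparams={"requires_grad": True})

print(any(p.requires_grad for p in frozen._elmo_lstm.parameters()))   # False
print(all(p.requires_grad for p in tunable._elmo_lstm.parameters()))  # True
```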
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder +from texar.torch.utils.test import pretrained_test + + +class ELMoEncoderTest(unittest.TestCase): + r"""Tests :class:`~texar.torch.modules.ELMoEncoder` class. + """ + + @pretrained_test + def test_model_loading(self): + r"""Tests model loading functionality.""" + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + for pretrained_model_name in ELMoEncoder.available_checkpoints(): + encoder = ELMoEncoder(pretrained_model_name=pretrained_model_name) + _ = encoder(character_ids) + + def test_encode(self): + r"""Tests encoding. + """ + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + } + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + assert list(elmo_representations[0].size()) == [2, 7, 32] + assert list(elmo_representations[1].size()) == [2, 7, 32] + assert list(mask.size()) == [2, 7] + + def test_elmo_keep_sentence_boundaries(self): + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + }, + 'dropout': 0.0, + 'keep_sentence_boundaries': True, + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + # Add 2 to the lengths because we're keeping the start and end of + # sentence tokens. 
+ assert list(elmo_representations[0].size()) == [2, 7 + 2, 32] + assert list(elmo_representations[1].size()) == [2, 7 + 2, 32] + assert list(mask.size()) == [2, 7 + 2] + + @pretrained_test + def test_trainable_variables(self): + encoder = ELMoEncoder() + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is False for grad in elmo_grads) + + encoder = ELMoEncoder(hparams={'requires_grad': True}) + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is True for grad in elmo_grads) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/__init__.py b/texar/torch/modules/pretrained/__init__.py index 1f06a87a9..1e0ae19d3 100644 --- a/texar/torch/modules/pretrained/__init__.py +++ b/texar/torch/modules/pretrained/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.pretrained.pretrained_base import * from texar.torch.modules.pretrained.bert import * +from texar.torch.modules.pretrained.elmo import * from texar.torch.modules.pretrained.gpt2 import * from texar.torch.modules.pretrained.roberta import * from texar.torch.modules.pretrained.xlnet import * diff --git a/texar/torch/modules/pretrained/elmo.py b/texar/torch/modules/pretrained/elmo.py new file mode 100644 index 000000000..2783aa4e5 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo.py @@ -0,0 +1,104 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. +""" + +import json +import os + +from abc import ABC +from typing import Any, Dict + +from texar.torch.modules.pretrained.pretrained_base import PretrainedMixin + +__all__ = [ + "PretrainedELMoMixin", +] + +_ELMo_PATH = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/" + + +class PretrainedELMoMixin(PretrainedMixin, ABC): + r"""A mixin class to support loading pre-trained checkpoints for modules + that implement the ELMo model. + + The ELMo model was proposed in + `Deep contextualized word representations`_ + by `Peters et al.` from Allen Institute for Artificial Intelligence. It is + a deep bidirectional language model (`biLM`), which is pre-trained on a + large text corpus. + + The available ELMo models are as follows: + + * ``elmo-small``: 13.6M parameters, trained on 800M tokens. + * ``elmo-medium``: 28.0M parameters, trained on 800M tokens. + * ``elmo-original``: 93.6M parameters, trained on 800M tokens. + * ``elmo-original-5.5b``: 93.6M parameters, trained on 5.5B tokens. + + We provide the following ELMo classes: + + * :class:`~texar.torch.modules.ELMoEncoder` for text encoding. + + .. 
_`Deep contextualized word representations`: + https://arxiv.org/abs/1802.05365 + """ + _MODEL_NAME = "ELMo" + _MODEL2URL = { + 'elmo-small': [ + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_options.json', + ], + 'elmo-medium': [ + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_options.json', + ], + 'elmo-original': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_options.json', + ], + 'elmo-original-5.5b': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json', + ], + } + + @classmethod + def _transform_config(cls, pretrained_model_name: str, + cache_dir: str) -> Dict[str, Any]: + info = list(os.walk(cache_dir)) + root, _, files = info[0] + config_path = None + for file in files: + if file.endswith('options.json'): + config_path = os.path.join(root, file) + if config_path is None: + raise ValueError(f"Cannot find the config file in {cache_dir}") + + with open(config_path) as f: + config_elmo = json.loads(f.read()) + + return {'encoder': config_elmo} + + def _init_from_checkpoint(self, pretrained_model_name: str, + cache_dir: str, **kwargs): + return diff --git a/texar/torch/modules/pretrained/elmo_test.py b/texar/torch/modules/pretrained/elmo_test.py new file mode 100644 index 000000000..d31bb1f5a --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_test.py @@ -0,0 +1,71 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo utils. +""" + +import os +import unittest + +from texar.torch.modules.pretrained.elmo import * +from texar.torch.utils.test import pretrained_test + + +class ELMoUtilsTest(unittest.TestCase): + r"""Tests ELMo Utils. 
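As a sketch of how the mixin is meant to be used, mirroring the test below: `available_checkpoints` and `download_checkpoint` come from the `PretrainedMixin` base class and require network access, and `_transform_config` is private but is exercised the same way in the test.

```python
from texar.torch.modules.pretrained.elmo import PretrainedELMoMixin

# The four checkpoints registered in _MODEL2URL; each bundles an hdf5
# weight file and a JSON options file.
print(PretrainedELMoMixin.available_checkpoints())
# ['elmo-small', 'elmo-medium', 'elmo-original', 'elmo-original-5.5b']

# download_checkpoint returns the local cache directory; _transform_config
# then reads the options file into the "encoder" hyperparameter block.
cache_dir = PretrainedELMoMixin.download_checkpoint(
    pretrained_model_name="elmo-small")
hparams = PretrainedELMoMixin._transform_config(
    pretrained_model_name="elmo-small", cache_dir=cache_dir)
print(sorted(hparams["encoder"].keys()))  # ['char_cnn', 'lstm']
```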
+ """ + + @pretrained_test + def test_load_pretrained_elmo_AND_transform_elmo_to_texar_config(self): + pretrained_model_dir = PretrainedELMoMixin.download_checkpoint( + pretrained_model_name="elmo-small") + + info = list(os.walk(pretrained_model_dir)) + _, _, files = info[0] + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', files) + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_options.json', files) + + model_config = PretrainedELMoMixin._transform_config( + pretrained_model_name="elmo-small", + cache_dir=pretrained_model_dir) + + exp_config = { + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + } + + self.assertDictEqual(model_config, exp_config) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/elmo_utils.py b/texar/torch/modules/pretrained/elmo_utils.py new file mode 100644 index 000000000..be6e76c5d --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils.py @@ -0,0 +1,1710 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. 
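`ScalarMix`, exported below and instantiated by `ELMoEncoder`, combines the biLM layers with softmax-normalized learned weights and a global scale; its definition appears further down in this file. The following standalone sketch (made-up helper name; layer normalization and masking omitted) only illustrates the arithmetic, including the `[-9e10, 1, -9e10]` trick from the encoder documentation for selecting a single layer.

```python
import torch


def scalar_mix_sketch(layers, weights, gamma=1.0):
    """Minimal stand-in for ScalarMix: a softmax-weighted average of the
    biLM layers, scaled by a global gamma."""
    normed = torch.softmax(torch.tensor(weights), dim=0)
    return gamma * sum(w * layer for w, layer in zip(normed, layers))


# Three "layers" (char-CNN output plus two LSTM layers, as _ElmoBiLm below
# produces), each of shape (batch, time, dim).
layers = [torch.randn(2, 7, 32) for _ in range(3)]

# Pre-softmax weights that keep only the first LSTM layer.
mixed = scalar_mix_sketch(layers, weights=[-9e10, 1.0, -9e10])
print(torch.allclose(mixed, layers[1]))  # True
```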
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo_lstm.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/encoder_base.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/lstm_cell_with_projection.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/highway.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/scalar_mix.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/time_distributed.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/embedding.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/initializers.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py` +""" +import itertools +import json +import logging + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import h5py +import numpy +import torch + +from torch.nn import ParameterList, Parameter +from torch.nn.functional import embedding +from torch.nn.utils.rnn import ( + pad_packed_sequence, pack_padded_sequence, PackedSequence) + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + batch_to_ids, ELMoCharacterMapper) +from texar.torch.utils.utils import ( + combine_initial_dims, get_device_of, lazy_groups_of, sort_batch_by_length, + uncombine_initial_dims) + +# pylint: disable=attribute-defined-outside-init,protected-access + +__all__ = [ + "_ElmoBiLm", + "_ElmoCharacterEncoder", + "_EncoderBase", + "ElmoLstm", + "Embedding", + "Highway", + "LstmCellWithProjection", + "ScalarMix", + "TimeDistributed", + "add_sentence_boundary_token_ids", + "block_orthogonal", + "get_dropout_mask", + "remove_sentence_boundaries", +] + + +class _ElmoBiLm(torch.nn.Module): + r"""Run a pre-trained bidirectional language model, outputting the + activations at each layer for weighting together into an ELMo + representation. + + Args: + options_file: ELMo JSON options file + weight_file: ELMo hdf5 weight file + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. + vocab_to_cache: A list of words to pre-compute and cache character + convolutions for. If you use this option, `_ElmoBiLm` expects that + you pass word indices of shape `(batch_size, timesteps)` to forward, + instead of character indices. If you use this option and pass a word + which wasn't pre-cached, this will break. + """ + def __init__(self, options_file: str, weight_file: Optional[str] = None, + requires_grad: bool = False, + vocab_to_cache: Optional[List[str]] = None) -> None: + super().__init__() + self._token_embedder = _ElmoCharacterEncoder( + options_file, weight_file, requires_grad=requires_grad) + self._requires_grad = requires_grad + # This is an embedding, used to look up cached + # word vectors built from character level cnn embeddings. + self._word_embedding = None + self._bos_embedding: torch.Tensor = None # type: ignore + self._eos_embedding: torch.Tensor = None # type: ignore + if vocab_to_cache: + logging.info( + "Caching character cnn layers for words in vocabulary.") + # This sets 3 attributes, _word_embedding, _bos_embedding and + # _eos_embedding. They are set in the method so they can be accessed + # from outside the constructor. 
+ self.create_cached_cnn_embeddings(vocab_to_cache) + + with open(options_file, "r") as fin: + options = json.load(fin) + if not options["lstm"].get("use_skip_connections"): + raise ValueError( + "We only support pretrained biLMs with residual connections") + self._elmo_lstm = ElmoLstm( + input_size=options["lstm"]["projection_dim"], + hidden_size=options["lstm"]["projection_dim"], + cell_size=options["lstm"]["dim"], + num_layers=options["lstm"]["n_layers"], + memory_cell_clip_value=options["lstm"]["cell_clip"], + state_projection_clip_value=options["lstm"]["proj_clip"], + requires_grad=requires_grad) + + if weight_file is not None: + self._elmo_lstm.load_weights(weight_file) + # Number of representation layers including context independent layer + self.num_layers = options["lstm"]["n_layers"] + 1 + + def get_output_dim(self): + return 2 * self._token_embedder.get_output_dim() + + def forward( # type: ignore + self, inputs: torch.Tensor, word_inputs: Optional[torch.Tensor] = None + ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + Args: + inputs: Shape `(batch_size, timesteps, 50)` of character ids + representing the current batch. + word_inputs: If you passed a cached vocab, you can in addition pass + a tensor of shape `(batch_size, timesteps)`, which represent + word ids which have been pre-cached. + + Returns: + Dict with keys: + + - `'activations'`: A list of activations at each layer of the + network, each of shape `(batch_size, timesteps + 2, + embedding_dim)`. + - `'mask'`: Shape `(batch_size, timesteps + 2)` long tensor with + sequence mask. + + Note that the output tensors all include additional special begin + and end of sequence markers. + """ + if self._word_embedding is not None and word_inputs is not None: + try: + mask_without_bos_eos = (word_inputs > 0).long() + # The character cnn part is cached - just look it up. + embedded_inputs = self._word_embedding( + word_inputs) + # shape (batch_size, timesteps + 2, embedding_dim) + type_representation, mask = add_sentence_boundary_token_ids( + embedded_inputs, mask_without_bos_eos, self._bos_embedding, + self._eos_embedding) + except RuntimeError: + # Back off to running the character convolutions, + # as we might not have the words in the cache. + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + else: + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + lstm_outputs = self._elmo_lstm(type_representation, mask) + + # Prepare the output. The first layer is duplicated. + # Because of minor differences in how masking is applied depending + # on whether the char cnn layers are cached, we'll be defensive and + # multiply by the mask here. It's not strictly necessary, as the + # mask passed on is correct, but the values in the padded areas + # of the char cnn representations can change. 
+ output_tensors = [ + torch.cat([type_representation, type_representation], dim=-1) + * mask.float().unsqueeze(-1)] + for layer_activations in torch.chunk(lstm_outputs, + lstm_outputs.size(0), dim=0): + output_tensors.append(layer_activations.squeeze(0)) + + return {"activations": output_tensors, "mask": mask} + + def create_cached_cnn_embeddings(self, tokens: List[str]) -> None: + r"""Given a list of tokens, this method precomputes word representations + by running just the character convolutions and highway layers of elmo, + essentially creating uncontextual word vectors. On subsequent forward + passes, the word ids are looked up from an embedding, rather than being + computed on the fly via the CNN encoder. + + This function sets 3 attributes: + + _word_embedding: The word embedding for each word in the tokens passed + to this method. + _bos_embedding: The embedding for the BOS token. + _eos_embedding: The embedding for the EOS token. + + Args: + tokens: A list of tokens to precompute character convolutions for. + """ + tokens = [ELMoCharacterMapper.bos_token, + ELMoCharacterMapper.eos_token] + tokens + timesteps = 32 + batch_size = 32 + chunked_tokens = lazy_groups_of(iter(tokens), timesteps) + + all_embeddings = [] + device = get_device_of(next(self.parameters())) + for batch in lazy_groups_of(chunked_tokens, batch_size): + # Shape (batch_size, timesteps, 50) + batched_tensor = batch_to_ids(batch) + # NOTE: This device check is for when a user calls this method + # having already placed the model on a device. If this is called in + # the constructor, it will probably happen on the CPU. This isn't + # too bad, because it's only a few convolutions and will likely + # be very fast. + if device >= 0: + batched_tensor = batched_tensor.cuda(device) + output = self._token_embedder(batched_tensor) + token_embedding = output["token_embedding"] + mask = output["mask"] + token_embedding, _ = remove_sentence_boundaries(token_embedding, + mask) + all_embeddings.append(token_embedding.view( + -1, token_embedding.size(-1))) + full_embedding = torch.cat(all_embeddings, 0) + + # We might have some trailing embeddings from padding in the batch, so + # we clip the embedding and lookup to the right size. + full_embedding = full_embedding[: len(tokens), :] + embedding_ = full_embedding[2: len(tokens), :] + vocab_size, embedding_dim = list(embedding_.size()) + + self._bos_embedding = full_embedding[0, :] + self._eos_embedding = full_embedding[1, :] + self._word_embedding = Embedding( # type: ignore + vocab_size, + embedding_dim, + weight=embedding_.data, + trainable=self._requires_grad, + padding_index=0) + + +class _ElmoCharacterEncoder(torch.nn.Module): + r"""Compute context insensitive token representation using pre-trained biLM. + + This embedder has input character ids of size + `(batch_size, sequence_length, 50)` + and returns `(batch_size, sequence_length + 2, embedding_dim)`, where + `embedding_dim` is specified in the options file (typically 512). + + We add special entries at the beginning and end of each sequence + corresponding to and , the beginning and end of sentence tokens. + + Args: + options_file: ELMo JSON options file. + weight_file: ELMo hdf5 weight file. + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. 
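A runnable sketch of the shape contract described above, under the assumption that writing a small options dict (values borrowed from the unit tests in this diff) to a temporary file is an acceptable stand-in for a downloaded options file; with no weight file the parameters are randomly initialized, so only the shapes are meaningful.

```python
import json
import tempfile

from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids
from texar.torch.modules.pretrained.elmo_utils import _ElmoCharacterEncoder

# A tiny options dict in the same format as the pre-trained options files.
options = {
    "lstm": {"use_skip_connections": True, "projection_dim": 16,
             "cell_clip": 3, "proj_clip": 3, "dim": 64, "n_layers": 2},
    "char_cnn": {"activation": "relu",
                 "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]],
                 "n_highway": 2, "embedding": {"dim": 4},
                 "n_characters": 262, "max_characters_per_token": 50},
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump(options, fp)

encoder = _ElmoCharacterEncoder(options_file=fp.name, weight_file=None)
character_ids = batch_to_ids([["A", "sentence", "."]])  # (1, 3, 50)
output = encoder(character_ids)

# Sentence-boundary rows are added internally, hence timesteps + 2.
print(output["token_embedding"].shape)  # torch.Size([1, 5, 16])
print(output["mask"].shape)             # torch.Size([1, 5])
```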
+ """ + def __init__(self, options_file: str, weight_file: Optional[str] = None, + requires_grad: bool = False) -> None: + super().__init__() + + with open(options_file, "r") as fin: + self._options = json.load(fin) + self._weight_file = weight_file + + self.output_dim = self._options["lstm"]["projection_dim"] + self.requires_grad = requires_grad + + if weight_file is not None: + self._load_weights() + else: + # Do not load the weights + self._load_weights(False) + + # Cache the arrays for use in forward -- +1 due to masking. + self._beginning_of_sentence_characters = torch.from_numpy( + numpy.array( + ELMoCharacterMapper.beginning_of_sentence_characters) + 1) + self._end_of_sentence_characters = torch.from_numpy( + numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1) + + def get_output_dim(self): + return self.output_dim + + def forward(self, # type: ignore + inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + r"""Compute context insensitive token embeddings for ELMo + representations. + + Args: + inputs: Shape `(batch_size, sequence_length, 50)` of character ids + representing the current batch. + + Returns: + Dict with keys: + + - `'token_embedding'`: Shape `(batch_size, sequence_length + 2, + embedding_dim)` tensor with context insensitive token + representations. + - `'mask'`: Shape `(batch_size, sequence_length + 2)` long tensor + with sequence mask. + """ + # Add BOS/EOS + mask = ((inputs > 0).long().sum(dim=-1) > 0).long() + character_ids_with_bos_eos, mask_with_bos_eos = \ + add_sentence_boundary_token_ids( + inputs, mask, self._beginning_of_sentence_characters, + self._end_of_sentence_characters) + + # the character id embedding + max_chars_per_token = \ + self._options["char_cnn"]["max_characters_per_token"] + # (batch_size * sequence_length, max_chars_per_token, embed_dim) + character_embedding = torch.nn.functional.embedding( + character_ids_with_bos_eos.view(-1, max_chars_per_token), + self._char_embedding_weights) + + # run convolutions + cnn_options = self._options["char_cnn"] + activation: Callable + if cnn_options["activation"] == "tanh": + activation = torch.tanh + elif cnn_options["activation"] == "relu": + activation = torch.nn.functional.relu + else: + raise ValueError("Unknown activation") + + # (batch_size * sequence_length, embed_dim, max_chars_per_token) + character_embedding = torch.transpose(character_embedding, 1, 2) + convs = [] + for i in range(len(self._convolutions)): + conv = getattr(self, "char_conv_{}".format(i)) + convolved = conv(character_embedding) + # (batch_size * sequence_length, n_filters for this width) + convolved, _ = torch.max(convolved, dim=-1) + convolved = activation(convolved) + convs.append(convolved) + + # (batch_size * sequence_length, n_filters) + token_embedding = torch.cat(convs, dim=-1) + # apply the highway layers (batch_size * sequence_length, n_filters) + token_embedding = self._highways(token_embedding) + # final projection (batch_size * sequence_length, embedding_dim) + token_embedding = self._projection(token_embedding) + # reshape to (batch_size, sequence_length, embedding_dim) + batch_size, sequence_length, _ = character_ids_with_bos_eos.size() + + return {"mask": mask_with_bos_eos, + "token_embedding": token_embedding.view( + batch_size, sequence_length, -1)} + + def _load_weights(self, load_weights=True): + self._load_char_embedding(load_weights) + self._load_cnn_weights(load_weights) + self._load_highway(load_weights) + self._load_projection(load_weights) + + def _load_char_embedding(self, load_weights): + if 
load_weights: + with h5py.File(self._weight_file, "r") as fin: + char_embed_weights = fin["char_embed"][...] + + weights = numpy.zeros( + (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), + dtype="float32") + weights[1:, :] = char_embed_weights + + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad) + else: + weights = numpy.zeros(( + self._options['char_cnn']['n_characters'], + self._options['char_cnn']['embedding']['dim']), dtype="float32") + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad) + + def _load_cnn_weights(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + char_embed_dim = cnn_options["embedding"]["dim"] + + convolutions = [] + for i, (width, num) in enumerate(filters): + conv = torch.nn.Conv1d(in_channels=char_embed_dim, out_channels=num, + kernel_size=width, bias=True) + if load_weights: + # load the weights + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN"]["W_cnn_{}".format(i)][...] + bias = fin["CNN"]["b_cnn_{}".format(i)][...] + + w_reshaped = numpy.transpose(weight.squeeze(axis=0), + axes=(2, 1, 0)) + if w_reshaped.shape != tuple(conv.weight.data.shape): + raise ValueError("Invalid weight file") + conv.weight.data.copy_(torch.FloatTensor(w_reshaped)) + conv.bias.data.copy_(torch.FloatTensor(bias)) + conv.weight.requires_grad = self.requires_grad + conv.bias.requires_grad = self.requires_grad + + convolutions.append(conv) + self.add_module("char_conv_{}".format(i), conv) + self._convolutions = convolutions + + def _load_highway(self, load_weights): + # the highway layers have same dimensionality as the number of cnn + # filters + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + n_highway = cnn_options["n_highway"] + + # create the layers, and load the weights + self._highways = Highway(n_filters, n_highway, + activation=torch.nn.functional.relu) + if load_weights: + for k in range(n_highway): + # The AllenNLP highway is one matrix multplication with + # concatenation of transform and carry weights. + with h5py.File(self._weight_file, "r") as fin: + # The weights are transposed due to multiplication order + # assumptions in tf vs pytorch (tf.matmul(X, W) vs + # pytorch.matmul(W, X)) + w_transform = numpy.transpose( + fin["CNN_high_{}".format(k)]["W_transform"][...]) + # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but + # tf is (1 - g) * x + g * f(x) + w_carry = -1.0 * numpy.transpose( + fin["CNN_high_{}".format(k)]["W_carry"][...]) + weight = numpy.concatenate([w_transform, w_carry], axis=0) + self._highways._layers[k].weight.data.copy_( + torch.FloatTensor(weight)) + self._highways._layers[k].weight.requires_grad = \ + self.requires_grad + b_transform = \ + fin["CNN_high_{}".format(k)]["b_transform"][...] + b_carry = \ + -1.0 * fin["CNN_high_{}".format(k)]["b_carry"][...] 
+ bias = numpy.concatenate([b_transform, b_carry], axis=0) + self._highways._layers[k].bias.data.copy_( + torch.FloatTensor(bias)) + self._highways._layers[k].bias.requires_grad = \ + self.requires_grad + + def _load_projection(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + + self._projection = torch.nn.Linear(n_filters, self.output_dim, + bias=True) + if load_weights: + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN_proj"]["W_proj"][...] + bias = fin["CNN_proj"]["b_proj"][...] + self._projection.weight.data.copy_(torch.FloatTensor( + numpy.transpose(weight))) + self._projection.bias.data.copy_(torch.FloatTensor(bias)) + self._projection.weight.requires_grad = self.requires_grad + self._projection.bias.requires_grad = self.requires_grad + + +RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] +RnnStateStorage = Tuple[torch.Tensor, ...] + + +class _EncoderBase(torch.nn.Module): + r"""This abstract class serves as a base for `Encoder`. + + Additionally, this class provides functionality for sorting sequences by + length so they can be consumed by PyTorch RNN classes, which require their + inputs to be sorted by length. Finally, it also provides optional + statefulness to all of it's subclasses by allowing the caching and + retrieving of the hidden states of RNNs. + """ + def __init__(self, stateful: bool = False) -> None: + super().__init__() + self.stateful = stateful + self._states: Optional[RnnStateStorage] = None + + def sort_and_run_forward( + self, + module: Callable[[PackedSequence, Optional[RnnState]], + Tuple[Union[PackedSequence, torch.Tensor], + RnnState]], + inputs: torch.Tensor, mask: torch.Tensor, + hidden_state: Optional[RnnState] = None): + r"""This function exists because PyTorch RNNs require that their inputs + be sorted before being passed as input. As all of our Seq2xxxEncoders + use this functionality, it is provided in a base class. This method can + be called on any module which takes as input a `PackedSequence` and + some `hidden_state`, which can either be a tuple of tensors or a tensor. + + As all of our Seq2xxxEncoders have different return types, we return + `sorted` outputs from the module, which is called directly. + Additionally, we return the indices into the batch dimension required + to restore the tensor to it's correct, unsorted order and the number of + valid batch elements (i.e the number of elements in the batch which are + not completely masked). This un-sorting and re-padding of the module + outputs is left to the subclasses because their outputs have different + types and handling them smoothly here is difficult. + + Args: + module: A function to run on the inputs. In most cases, this is a + `torch.nn.Module`. + inputs: A tensor of shape `(batch_size, sequence_length, + embedding_size)` representing the inputs to the Encoder. + mask: A tensor of shape `(batch_size, sequence_length)`, + representing masked and non-masked elements of the sequence for + each element in the batch. + hidden_state: A single tensor of shape `(num_layers, batch_size, + hidden_size)` representing the state of an RNN with or a tuple + of tensors of shapes `(num_layers, batch_size, hidden_size)` and + `(num_layers, batch_size, memory_size)`, representing the hidden + state and memory state of an LSTM-like RNN. + + Returns: + module_output: A Tensor or `PackedSequence` representing the output + of the PyTorch Module. 
The batch size dimension will be equal to + `num_valid`, as sequences of zero length are clipped off before + the module is called, as PyTorch cannot handle zero length + sequences. + final_states: A Tensor representing the hidden state of the PyTorch + Module. This can either be a single tensor of shape + `(num_layers, num_valid, hidden_size)`, for instance in the case + of a GRU, or a tuple of tensors, such as those required for an + LSTM. + restoration_indices: A tensor of shape `(batch_size,)`, describing + the re-indexing required to transform the outputs back to their + original batch order. + """ + # In some circumstances you may have sequences of zero length. + # `pack_padded_sequence` requires all sequence lengths to be > 0, so + # remove sequences of zero length before calling self._module, then + # fill with zeros. + + # First count how many sequences are empty. + batch_size = mask.size(0) + num_valid = torch.sum(mask[:, 0]).int().item() + + sequence_lengths = mask.long().sum(-1) + (sorted_inputs, sorted_sequence_lengths, restoration_indices, + sorting_indices) = sort_batch_by_length(inputs, sequence_lengths) + + # Now create a PackedSequence with only the non-empty, sorted sequences. + packed_sequence_input = pack_padded_sequence( + sorted_inputs[:num_valid, :, :], + sorted_sequence_lengths[:num_valid].data.tolist(), batch_first=True) + # Prepare the initial states. + if not self.stateful: + if hidden_state is None: + initial_states: Any = hidden_state + elif isinstance(hidden_state, tuple): + initial_states = [state.index_select( + 1, sorting_indices)[:, :num_valid, :].contiguous() + for state in hidden_state] + else: + initial_states = hidden_state.index_select(1, sorting_indices)[ + :, :num_valid, :].contiguous() + else: + initial_states = self._get_initial_states(batch_size, num_valid, + sorting_indices) + + # Actually call the module on the sorted PackedSequence. + module_output, final_states = module(packed_sequence_input, + initial_states) + return module_output, final_states, restoration_indices + + def _get_initial_states(self, batch_size: int, num_valid: int, + sorting_indices: torch.LongTensor) -> \ + Optional[RnnState]: + r"""Returns an initial state for use in an RNN. Additionally, this + method handles the batch size changing across calls by mutating the + state to append initial states for new elements in the batch. Finally, + it also handles sorting the states with respect to the sequence lengths + of elements in the batch and removing rows which are completely padded. + Importantly, this `mutates` the state if the current batch size is + larger than when it was previously called. + + Args: + batch_size: The batch size can change size across calls to stateful + RNNs, so we need to know if we need to expand or shrink the + states before returning them. Expanded states will be set to + zero. + num_valid: The batch may contain completely padded sequences which + get removed before the sequence is passed through the encoder. + We also need to clip these off of the state too. + sorting_indices: Pytorch RNNs take sequences sorted by length. When + we return the states to be used for a given call to + `module.forward`, we need the states to match up to the sorted + sequences, so before returning them, we sort the states using + the same indices used to sort the sequences. + + Returns: + This method has a complex return type because it has to deal with + the first time it is called, when it has no state, and the fact that + types of RNN have heterogeneous states. 
+ + If it is the first time the module has been called, it returns + `None`, regardless of the type of the `Module`. + + Otherwise, for LSTMs, it returns a tuple of `torch.Tensors` with + shape `(num_layers, num_valid, state_size)` and `(num_layers, + num_valid, memory_size)` respectively, for GRUs, it returns a single + `torch.Tensor` of shape `(num_layers, num_valid, state_size)`. + """ + # We don't know the state sizes the first time calling forward, + # so we let the module define what it's initial hidden state looks like. + if self._states is None: + return None + + # Otherwise, we have some previous states. + if batch_size > self._states[0].size(1): + # This batch is larger than the all previous states. + # If so, resize the states. + num_states_to_concat = batch_size - self._states[0].size(1) + resized_states = [] + # state has shape (num_layers, batch_size, hidden_size) + for state in self._states: + # This _must_ be inside the loop because some + # RNNs have states with different last dimension sizes. + zeros = state.new_zeros(state.size(0), num_states_to_concat, + state.size(2)) + resized_states.append(torch.cat([state, zeros], 1)) + self._states = tuple(resized_states) + correctly_shaped_states = self._states + elif batch_size < self._states[0].size(1): + # This batch is smaller than the previous one. + correctly_shaped_states = tuple(state[:, :batch_size, :] for state + in self._states) + else: + correctly_shaped_states = self._states + + # At this point, our states are of shape (num_layers, batch_size, + # hidden_size). However, the encoder uses sorted sequences and + # additionally removes elements of the batch which are fully padded. + # We need the states to match up to these sorted and filtered + # sequences, so we do that in the next two blocks before returning the + # state/s. + if len(self._states) == 1: + # GRUs only have a single state. This `unpacks` it from the + # tuple and returns the tensor directly. + correctly_shaped_state = correctly_shaped_states[0] + sorted_state = correctly_shaped_state.index_select( + 1, sorting_indices) + return sorted_state[:, :num_valid, :].contiguous() + else: + # LSTMs have a state tuple of (state, memory). + sorted_states = [state.index_select(1, sorting_indices) for state in + correctly_shaped_states] + return tuple(state[:, :num_valid, :].contiguous() # type: ignore + for state in sorted_states) + + def _update_states(self, final_states: RnnStateStorage, + restoration_indices: torch.LongTensor) -> None: + r"""After the RNN has run forward, the states need to be updated. + This method just sets the state to the updated new state, performing + several pieces of book-keeping along the way - namely, unsorting the + states and ensuring that the states of completely padded sequences are + not updated. Finally, it also detaches the state variable from the + computational graph, such that the graph can be garbage collected after + each batch iteration. + + Args: + final_states: The hidden states returned as output from the RNN. + restoration_indices: The indices that invert the sorting used in + `sort_and_run_forward` to order the states with respect to the + lengths of the sequences in the batch. + """ + new_unsorted_states = [state.index_select(1, restoration_indices) for + state in final_states] + + if self._states is None: + # We don't already have states, so just set the + # ones we receive to be the current state. 
+ self._states = tuple(state.data for state in new_unsorted_states)
+ else:
+ # Now we've sorted the states back so that they correspond to the
+ # original indices, we need to figure out what states we need to
+ # update, because if we didn't use a state for a particular row,
+ # we want to preserve its state. Thankfully, the rows which are
+ # all zero in the state correspond exactly to those which aren't
+ # used, so we create masks of shape (new_batch_size,), denoting
+ # which states were used in the RNN computation.
+ current_state_batch_size = self._states[0].size(1)
+ new_state_batch_size = final_states[0].size(1)
+ # Masks denoting the used rows, of shape (1, new_batch_size, 1)
+ used_new_rows_mask = [(state[0, :, :].sum(-1) != 0.0).float().view(
+ 1, new_state_batch_size, 1) for state in new_unsorted_states]
+ new_states = []
+ if current_state_batch_size > new_state_batch_size:
+ # The new state is smaller than the old one,
+ # so just update the indices which we used.
+ for old_state, new_state, used_mask in zip(
+ self._states, new_unsorted_states, used_new_rows_mask):
+ # zero out all rows in the previous state
+ # which _were_ used in the current state.
+ masked_old_state = \
+ old_state[:, :new_state_batch_size, :] * (1 - used_mask)
+ # The old state is larger, so update the relevant parts of
+ # it.
+ old_state[:, :new_state_batch_size, :] = \
+ new_state + masked_old_state
+ new_states.append(old_state.detach())
+ else:
+ # The states are the same size, so we just have to
+ # deal with the possibility that some rows weren't used.
+ new_states = []
+ for old_state, new_state, used_mask in zip(
+ self._states, new_unsorted_states, used_new_rows_mask):
+ # zero out all rows which _were_ used in the current state.
+ masked_old_state = old_state * (1 - used_mask)
+ # Add the preserved rows of the old state back into the new
+ # state.
+ new_state += masked_old_state
+ new_states.append(new_state.detach())
+
+ # It looks like there should be another case handled here - when
+ # the current_state_batch_size < new_state_batch_size. However,
+ # this never happens, because the states themselves are mutated
+ # by appending zeros when calling _get_initial_states, meaning that
+ # the new states are either of equal size, or smaller, in the case
+ # that there are some unused elements (zero-length) for the RNN
+ # computation.
+ self._states = tuple(new_states)
+
+ def reset_states(self, mask: Optional[torch.Tensor] = None) -> None:
+ r"""Resets the internal states of a stateful encoder.
+
+ Args:
+ mask: A tensor of shape `(batch_size,)` indicating which states
+ should be reset. If not provided, all states will be reset.
+ """
+ if mask is None:
+ self._states = None
+ else:
+ # state has shape (num_layers, batch_size, hidden_size). We reshape
+ # mask to have shape (1, batch_size, 1) so that operations
+ # broadcast properly.
+ mask_batch_size = mask.size(0)
+ mask = mask.float().view(1, mask_batch_size, 1)
+ new_states = []
+ assert self._states is not None
+ for old_state in self._states:
+ old_state_batch_size = old_state.size(1)
+ if old_state_batch_size != mask_batch_size:
+ raise ValueError(
+ f"Trying to reset states using mask with incorrect "
+ f"batch size. "
+ f"Expected batch size: {old_state_batch_size}.
" + f"Provided batch size: {mask_batch_size}.") + new_state = (1 - mask) * old_state + new_states.append(new_state.detach()) + self._states = tuple(new_states) + + +class ElmoLstm(_EncoderBase): + r"""A stacked, bidirectional LSTM which uses `LstmCellWithProjection`'s + with highway layers between the inputs to layers. The inputs to the forward + and backward directions are independent - forward and backward states are + not concatenated between layers. + + Additionally, this LSTM maintains its `own` state, which is updated every + time `forward` is called. It is dynamically resized for different batch + sizes and is designed for use with non-continuous inputs (i.e inputs which + aren't formatted as a stream, such as text used for a language modeling + task, which is how stateful RNNs are typically used). + This is non-standard, but can be thought of as having an "end of sentence" + state, which is carried across different sentences. + + Args: + input_size: The dimension of the inputs to the LSTM. + hidden_size: The dimension of the outputs of the LSTM. + cell_size: The dimension of the memory cell of the + `LstmCellWithProjection`. + num_layers: The number of bidirectional LSTMs to use. + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. + recurrent_dropout_probability: The dropout probability to be used in a + dropout scheme as stated in [A Theoretically Grounded Application of + Dropout in Recurrent Neural Networks] + (https://arxiv.org/abs/1512.05287). + state_projection_clip_value: The magnitude with which to clip the + `hidden_state` after projecting it. + memory_cell_clip_value: The magnitude with which to clip the memory + cell. + """ + + def __init__(self, input_size: int, hidden_size: int, cell_size: int, + num_layers: int, requires_grad: bool = False, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None) -> None: + super().__init__(stateful=True) + + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.cell_size = cell_size + self.requires_grad = requires_grad + + forward_layers = [] + backward_layers = [] + + lstm_input_size = input_size + go_forward = True + for layer_index in range(num_layers): + forward_layer = LstmCellWithProjection( + lstm_input_size, hidden_size, cell_size, go_forward, + recurrent_dropout_probability, memory_cell_clip_value, + state_projection_clip_value) + backward_layer = LstmCellWithProjection( + lstm_input_size, hidden_size, cell_size, not go_forward, + recurrent_dropout_probability, memory_cell_clip_value, + state_projection_clip_value) + lstm_input_size = hidden_size + + self.add_module("forward_layer_{}".format(layer_index), + forward_layer) + self.add_module("backward_layer_{}".format(layer_index), + backward_layer) + forward_layers.append(forward_layer) + backward_layers.append(backward_layer) + self.forward_layers = forward_layers + self.backward_layers = backward_layers + + def forward(self, inputs: torch.Tensor, # type: ignore + mask: torch.LongTensor) -> torch.Tensor: + r"""Encodes the inputs. + + Args: + inputs: A Tensor of shape + `(batch_size, sequence_length, hidden_size)`. + mask: A binary mask of shape `(batch_size, sequence_length)` + representing the non-padded elements in each sequence in the + batch. 
+ + Returns: + A `torch.Tensor` of shape `(num_layers, batch_size, sequence_length, + hidden_size)`, where the `num_layers` dimension represents the LSTM + output from that layer. + """ + batch_size, total_sequence_length = mask.size() + stacked_sequence_output, final_states, restoration_indices = \ + self.sort_and_run_forward(self._lstm_forward, inputs, mask) + + num_layers, num_valid, returned_timesteps, encoder_dim = \ + stacked_sequence_output.size() + # Add back invalid rows which were removed in the call to + # sort_and_run_forward. + if num_valid < batch_size: + zeros = stacked_sequence_output.new_zeros( + num_layers, batch_size - num_valid, returned_timesteps, + encoder_dim) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 1) + # The states also need to have invalid rows added back. + new_states = [] + for state in final_states: + state_dim = state.size(-1) + zeros = state.new_zeros(num_layers, batch_size - num_valid, + state_dim) + new_states.append(torch.cat([state, zeros], 1)) + final_states = new_states + + # It's possible to need to pass sequences which are padded to longer + # than the max length of the sequence to a Seq2StackEncoder. However, + # packing and unpacking the sequences mean that the returned tensor + # won't include these dimensions, because the RNN did not need to + # process them. We add them back on in the form of zeros here. + sequence_length_difference = total_sequence_length - returned_timesteps + if sequence_length_difference > 0: + zeros = stacked_sequence_output.new_zeros( + num_layers, batch_size, sequence_length_difference, + stacked_sequence_output[0].size(-1)) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 2) + self._update_states(final_states, restoration_indices) + + # Restore the original indices and return the sequence. + # Has shape (num_layers, batch_size, sequence_length, hidden_size) + return stacked_sequence_output.index_select(1, restoration_indices) + + def _lstm_forward(self, inputs: PackedSequence, + initial_state: Optional[ + Tuple[torch.Tensor, torch.Tensor]] = None) -> \ + Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Encodes the inputs. + + Args: + inputs: A batch first `PackedSequence` to run the stacked LSTM over. + initial_state: A tuple (state, memory) representing the initial + hidden state and memory of the LSTM, with shape + `(num_layers, batch_size, 2 * hidden_size)` and + `(num_layers, batch_size, 2 * cell_size)` respectively. + + Returns: + output_sequence: The encoded sequence of shape + `(num_layers, batch_size, sequence_length, hidden_size)`. + final_states: The per-layer final (state, memory) states of the + LSTM, with shape `(num_layers, batch_size, 2 * hidden_size)` and + `(num_layers, batch_size, 2 * cell_size)` respectively. The + last dimension is duplicated because it contains the + state/memory for both the forward and backward layers. 
+ """ + if initial_state is None: + hidden_states: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = \ + [None] * len(self.forward_layers) + elif initial_state[0].size()[0] != len(self.forward_layers): + raise ValueError( + "Initial states were passed to forward() but the number of " + "initial states does not match the number of layers.") + else: + hidden_states = list(zip(initial_state[0].split(1, 0), + initial_state[1].split(1, 0))) + + inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) + forward_output_sequence = inputs + backward_output_sequence = inputs + + final_states = [] + sequence_outputs = [] + for layer_index, state in enumerate(hidden_states): + forward_layer = getattr(self, "forward_layer_{}".format( + layer_index)) + backward_layer = getattr(self, "backward_layer_{}".format( + layer_index)) + + forward_cache = forward_output_sequence + backward_cache = backward_output_sequence + + forward_state = None + backward_state = None + if state is not None: + forward_hidden_state, backward_hidden_state = state[0].split( + self.hidden_size, 2) + forward_memory_state, backward_memory_state = state[1].split( + self.cell_size, 2) + forward_state = (forward_hidden_state, forward_memory_state) + backward_state = (backward_hidden_state, backward_memory_state) + + forward_output_sequence, forward_state = forward_layer( + forward_output_sequence, batch_lengths, forward_state) + backward_output_sequence, backward_state = backward_layer( + backward_output_sequence, batch_lengths, backward_state) + # Skip connections, just adding the input to the output. + if layer_index != 0: + forward_output_sequence += forward_cache + backward_output_sequence += backward_cache + + sequence_outputs.append( + torch.cat([forward_output_sequence, backward_output_sequence], + -1)) + # Append the state tuples in a list, so that we can return + # the final states for all the layers. + final_states.append( + (torch.cat([forward_state[0], backward_state[0]], -1), + torch.cat([forward_state[1], backward_state[1]], -1))) + + stacked_sequence_outputs: torch.FloatTensor = torch.stack( + sequence_outputs) + # Stack the hidden state and memory for each layer into 2 tensors of + # shape (num_layers, batch_size, hidden_size) and + # (num_layers, batch_size, cell_size) respectively. + final_hidden_states, final_memory_states = zip(*final_states) + final_state_tuple: Tuple[torch.FloatTensor, torch.FloatTensor] = ( + torch.cat(final_hidden_states, 0), + torch.cat(final_memory_states, 0)) + return stacked_sequence_outputs, final_state_tuple + + def load_weights(self, weight_file: str) -> None: + r"""Load the pre-trained weights from the file. + """ + requires_grad = self.requires_grad + + with h5py.File(weight_file, "r") as fin: + for i_layer, lstms in enumerate(zip(self.forward_layers, + self.backward_layers)): + for j_direction, lstm in enumerate(lstms): + # lstm is an instance of LSTMCellWithProjection + cell_size = lstm.cell_size + dataset = fin["RNN_%s" % j_direction]["RNN"][ + "MultiRNNCell"]["Cell%s" % i_layer]["LSTMCell"] + # tensorflow packs together both W and U matrices into one + # matrix, but pytorch maintains individual matrices. In + # addition, tensorflow packs the gates as input, memory, + # forget, output but pytorch uses input, forget, memory, + # output. So we need to modify the weights. 
+ tf_weights = numpy.transpose(dataset["W_0"][...])
+ torch_weights = tf_weights.copy()
+
+ # split the W from U matrices
+ input_size = lstm.input_size
+ input_weights = torch_weights[:, :input_size]
+ recurrent_weights = torch_weights[:, input_size:]
+ tf_input_weights = tf_weights[:, :input_size]
+ tf_recurrent_weights = tf_weights[:, input_size:]
+
+ # handle the different gate order convention
+ for torch_w, tf_w in [
+ [input_weights, tf_input_weights],
+ [recurrent_weights, tf_recurrent_weights]]:
+ torch_w[(1 * cell_size): (2 * cell_size), :] = tf_w[
+ (2 * cell_size): (3 * cell_size), :]
+ torch_w[(2 * cell_size): (3 * cell_size), :] = tf_w[
+ (1 * cell_size): (2 * cell_size), :]
+
+ lstm.input_linearity.weight.data.copy_(torch.FloatTensor(
+ input_weights))
+ lstm.state_linearity.weight.data.copy_(torch.FloatTensor(
+ recurrent_weights))
+ lstm.input_linearity.weight.requires_grad = requires_grad
+ lstm.state_linearity.weight.requires_grad = requires_grad
+
+ # the bias weights
+ tf_bias = dataset["B"][...]
+ # tensorflow adds 1.0 to forget gate bias instead of
+ # modifying the parameters...
+ tf_bias[(2 * cell_size): (3 * cell_size)] += 1
+ torch_bias = tf_bias.copy()
+ torch_bias[(1 * cell_size): (2 * cell_size)] = tf_bias[
+ (2 * cell_size): (3 * cell_size)]
+ torch_bias[(2 * cell_size): (3 * cell_size)] = tf_bias[
+ (1 * cell_size): (2 * cell_size)]
+ lstm.state_linearity.bias.data.copy_(torch.FloatTensor(
+ torch_bias))
+ lstm.state_linearity.bias.requires_grad = requires_grad
+
+ # the projection weights
+ proj_weights = numpy.transpose(dataset["W_P_0"][...])
+ lstm.state_projection.weight.data.copy_(torch.FloatTensor(
+ proj_weights))
+ lstm.state_projection.weight.requires_grad = requires_grad
+
+
+class LstmCellWithProjection(torch.nn.Module):
+ r"""An LSTM with Recurrent Dropout and a projected and clipped hidden state
+ and memory. Note: this implementation is slower than the native PyTorch
+ LSTM because it cannot make use of CUDNN optimizations for stacked RNNs due
+ to the variational dropout and the custom nature of the cell state.
+
+ Args:
+ input_size: The dimension of the inputs to the LSTM.
+ hidden_size: The dimension of the outputs of the LSTM.
+ cell_size: The dimension of the memory cell used for the LSTM.
+ go_forward: The direction in which the LSTM is applied to the sequence.
+ Forwards by default, or backwards if False.
+ recurrent_dropout_probability: The dropout probability to be used in a
+ dropout scheme as stated in [A Theoretically Grounded Application of
+ Dropout in Recurrent Neural Networks]
+ (https://arxiv.org/abs/1512.05287). Implementation-wise, this simply
+ applies a fixed dropout mask per sequence to the recurrent
+ connection of the LSTM.
+ state_projection_clip_value: The magnitude with which to clip the
+ `hidden_state` after projecting it.
+ memory_cell_clip_value: The magnitude with which to clip the memory
+ cell.
+
+ Returns:
+ output_accumulator: The outputs of the LSTM for each timestep. A tensor
+ of shape `(batch_size, max_timesteps, hidden_size)` where for a
+ given batch element, all outputs past the sequence length for that
+ batch are zero tensors.
+ final_state: The final (state, memory) states of the LSTM, with shape
+ `(1, batch_size, hidden_size)` and `(1, batch_size, cell_size)`
+ respectively. The first dimension is 1 in order to match the PyTorch
+ API for returning stacked LSTM states.
+ """ + + def __init__(self, input_size: int, hidden_size: int, cell_size: int, + go_forward: bool = True, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None) -> None: + super().__init__() + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.cell_size = cell_size + + self.go_forward = go_forward + self.state_projection_clip_value = state_projection_clip_value + self.memory_cell_clip_value = memory_cell_clip_value + self.recurrent_dropout_probability = recurrent_dropout_probability + + # We do the projections for all the gates all at once. + self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, + bias=False) + self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, + bias=True) + # Additional projection matrix for making the hidden state smaller. + self.state_projection = torch.nn.Linear(cell_size, hidden_size, + bias=False) + self.reset_parameters() + + def reset_parameters(self): + # Use sensible default initializations for parameters. + block_orthogonal(self.input_linearity.weight.data, + [self.cell_size, self.input_size]) + block_orthogonal(self.state_linearity.weight.data, + [self.cell_size, self.hidden_size]) + + self.state_linearity.bias.data.fill_(0.0) + # Initialize forget gate biases to 1.0 as per An Empirical + # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015). + self.state_linearity.bias.data[self.cell_size: + 2 * self.cell_size].fill_(1.0) + + def forward(self, inputs: torch.FloatTensor, # type: ignore + batch_lengths: List[int], + initial_state: Optional[Tuple[torch.Tensor, + torch.Tensor]] = None): + r"""Process the inputs. + + Args: + inputs: A tensor of shape `(batch_size, num_timesteps, input_size)` + to apply the LSTM over. + batch_lengths: A list of length batch_size containing the lengths + of the sequences in batch. + initial_state: A tuple (state, memory) representing the initial + hidden state and memory of the LSTM. The `state` has shape + `(1, batch_size, hidden_size)` and the `memory` has shape + `(1, batch_size, cell_size)`. + + Returns: + output_accumulator: The outputs of the LSTM for each timestep. A + tensor of shape `(batch_size, max_timesteps, hidden_size)` where + for a given batch element, all outputs past the sequence length + for that batch are zero tensors. + final_state: A tuple (state, memory) representing the initial hidden + state and memory of the LSTM. The `state` has shape + `(1, batch_size, hidden_size)` and the `memory` has shape + `(1, batch_size, cell_size)`. + """ + batch_size = inputs.size()[0] + total_timesteps = inputs.size()[1] + output_accumulator = inputs.new_zeros(batch_size, total_timesteps, + self.hidden_size) + if initial_state is None: + full_batch_previous_memory = inputs.new_zeros(batch_size, + self.cell_size) + full_batch_previous_state = inputs.new_zeros(batch_size, + self.hidden_size) + else: + full_batch_previous_state = initial_state[0].squeeze(0) + full_batch_previous_memory = initial_state[1].squeeze(0) + + current_length_index = batch_size - 1 if self.go_forward else 0 + if self.recurrent_dropout_probability > 0.0 and self.training: + dropout_mask = get_dropout_mask( + self.recurrent_dropout_probability, full_batch_previous_state) + else: + dropout_mask = None + + for timestep in range(total_timesteps): + # The index depends on which end we start. 
+ index = timestep if self.go_forward else \ + total_timesteps - timestep - 1 + # What we are doing here is finding the index into the batch + # dimension which we need to use for this timestep, because the + # sequences have variable length, so once the index is greater than + # the length of this particular batch sequence, we no longer need + # to do the computation for this sequence. The key thing to + # recognise here is that the batch inputs must be _ordered_ by + # length from longest (first in batch) to shortest (last) so + # initially, we are going forwards with every sequence and as we + # pass the index at which the shortest elements of the batch finish, + # we stop picking them up for the computation. + if self.go_forward: + while batch_lengths[current_length_index] <= index: + current_length_index -= 1 + # If we're going backwards, we are _picking up_ more indices. + else: + # First conditional: Are we already at the maximum number of + # elements in the batch? + # Second conditional: Does the next shortest sequence beyond + # the current batch index require computation use this timestep? + while (current_length_index < (len(batch_lengths) - 1) + and batch_lengths[current_length_index + 1] > index): + current_length_index += 1 + + # Actually get the slices of the batch which we + # need for the computation at this timestep. + # shape (batch_size, cell_size) + previous_memory = \ + full_batch_previous_memory[0: current_length_index + 1].clone() + # Shape (batch_size, hidden_size) + previous_state = \ + full_batch_previous_state[0: current_length_index + 1].clone() + # Shape (batch_size, input_size) + timestep_input = inputs[0: current_length_index + 1, index] + + # Do the projections for all the gates all at once. + # Both have shape (batch_size, 4 * cell_size) + projected_input = self.input_linearity(timestep_input) + projected_state = self.state_linearity(previous_state) + + # Main LSTM equations using relevant chunks of the big linear + # projections of the hidden state and inputs. + input_gate = torch.sigmoid( + projected_input[:, (0 * self.cell_size): (1 * self.cell_size)] + + projected_state[:, (0 * self.cell_size): (1 * self.cell_size)] + ) + forget_gate = torch.sigmoid( + projected_input[:, (1 * self.cell_size): (2 * self.cell_size)] + + projected_state[:, (1 * self.cell_size): (2 * self.cell_size)] + ) + memory_init = torch.tanh( + projected_input[:, (2 * self.cell_size): (3 * self.cell_size)] + + projected_state[:, (2 * self.cell_size): (3 * self.cell_size)] + ) + output_gate = torch.sigmoid( + projected_input[:, (3 * self.cell_size): (4 * self.cell_size)] + + projected_state[:, (3 * self.cell_size): (4 * self.cell_size)] + ) + memory = input_gate * memory_init + forget_gate * previous_memory + + # Here is the non-standard part of this LSTM cell; first, we clip + # the memory cell, then we project the output of the timestep to a + # smaller size and again clip it. + if self.memory_cell_clip_value: + memory = torch.clamp(memory, -self.memory_cell_clip_value, + self.memory_cell_clip_value) + + # shape (current_length_index, cell_size) + pre_projection_timestep_output = output_gate * torch.tanh(memory) + + # shape (current_length_index, hidden_size) + timestep_output = self.state_projection( + pre_projection_timestep_output) + if self.state_projection_clip_value: + timestep_output = torch.clamp( + timestep_output, -self.state_projection_clip_value, + self.state_projection_clip_value) + + # Only do dropout if the dropout prob is > 0.0 and we are in + # training mode. 
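+ # `dropout_mask` was sampled once before the timestep loop, so the
+ # same mask is reused at every timestep - this is the fixed
+ # per-sequence (variational) recurrent dropout described in the
+ # class docstring.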
+ if dropout_mask is not None: + timestep_output = \ + timestep_output * dropout_mask[0: current_length_index + 1] + + # We've been doing computation with less than the full batch, so + # here we create a new variable for the the whole batch at this + # timestep and insert the result for the relevant elements of the + # batch into it. + full_batch_previous_memory = full_batch_previous_memory.clone() + full_batch_previous_state = full_batch_previous_state.clone() + full_batch_previous_memory[0: current_length_index + 1] = memory + full_batch_previous_state[0: current_length_index + 1] = \ + timestep_output + output_accumulator[0: current_length_index + 1, index] = \ + timestep_output + + # Mimic the pytorch API by returning state in the following shape: + # (num_layers * num_directions, batch_size, ...). As this + # LSTM cell cannot be stacked, the first dimension here is just 1. + final_state = (full_batch_previous_state.unsqueeze(0), + full_batch_previous_memory.unsqueeze(0)) + return output_accumulator, final_state + + +class Highway(torch.nn.Module): + r"""A [Highway layer](https://arxiv.org/abs/1505.00387) does a gated + combination of a linear transformation and a non-linear transformation of + its input. :math:`y = g * x + (1 - g) * f(A(x))`, where :math:`A` is a + linear transformation, :math:`f` is an element-wise non-linearity, and + :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. + + This module will apply a fixed number of highway layers to its input, + returning the final result. + + Args: + input_dim: The dimensionality of :math:`x`. We assume the input has + shape `(batch_size, ..., input_dim)`. + num_layers: The number of highway layers to apply to the input. + activation: The non-linearity to use in the highway layers. + """ + + def __init__(self, input_dim: int, num_layers: int = 1, + activation: Callable[[torch.Tensor], torch.Tensor] = + torch.nn.functional.relu) -> None: + super().__init__() + self._input_dim = input_dim + self._layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, input_dim * 2) + for _ in range(num_layers)]) + self._activation = activation + for layer in self._layers: + # We should bias the highway layer to just carry its input forward. + # We do that by setting the bias on `B(x)` to be positive, because + # that means `g` will be biased to be high, so we will carry the + # input forward. The bias on `B(x)` is the second half of the + # bias vector in each Linear layer. + layer.bias[input_dim:].data.fill_(1) # type: ignore + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: # type: ignore + current_input = inputs + for layer in self._layers: + projected_input = layer(current_input) + linear_part = current_input + # NOTE: if you modify this, think about whether you should modify + # the initialization above, too. + nonlinear_part, gate = projected_input.chunk(2, dim=-1) + nonlinear_part = self._activation(nonlinear_part) + gate = torch.sigmoid(gate) + current_input = gate * linear_part + (1 - gate) * nonlinear_part + return current_input + + +class Embedding(torch.nn.Module): + r"""A more featureful embedding module than the default in Pytorch. Adds + the ability to: + + 1. embed higher-order inputs + 2. pre-specify the weight matrix + 3. use a non-trainable embedding + 4. project the resultant embeddings to some other dimension (which only + makes sense with non-trainable embeddings). + + Args: + num_embeddings: Size of the dictionary of embeddings (vocabulary size). 
embedding_dim: The size of each embedding vector.
+ projection_dim: If given, we add a projection layer after the embedding
+ layer. This really only makes sense if `trainable` is `False`.
+ weight: A pre-initialised weight matrix for the embedding lookup,
+ allowing the use of pre-trained vectors.
+ padding_index: If given, pads the output with zeros whenever it
+ encounters the index.
+ trainable: Whether or not to optimize the embedding parameters.
+ max_norm: If given, will renormalize the embeddings to always have a
+ norm less than this.
+ norm_type: The p of the p-norm to compute for the max_norm option.
+ scale_grad_by_freq: If given, this will scale gradients by the frequency
+ of the words in the mini-batch.
+ sparse: Whether or not the Pytorch backend should use a sparse
+ representation of the embedding weight.
+ vocab_namespace: In case of fine-tuning/transfer learning, the model's
+ embedding matrix needs to be extended according to the size of the
+ extended vocabulary. To know how much to extend the embedding
+ matrix, it is necessary to know which `vocab_namespace` was used to
+ construct it in the original training. We store the
+ `vocab_namespace` used during the original training as an attribute,
+ so that it can be retrieved during fine-tuning.
+ pretrained_file: Used to keep track of the source of the weights and to
+ load more embeddings at test time. **It does not load the weights
+ from this pretrained_file.** For that purpose, use
+ `Embedding.from_params`.
+
+ Returns:
+ An Embedding module.
+ """
+
+ default_implementation = "embedding"
+
+ def __init__(self,
+ num_embeddings: int, embedding_dim: int,
+ projection_dim: Optional[int] = None,
+ weight: Optional[torch.FloatTensor] = None,
+ padding_index: Optional[int] = None, trainable: bool = True,
+ max_norm: Optional[float] = None, norm_type: float = 2.0,
+ scale_grad_by_freq: bool = False, sparse: bool = False,
+ vocab_namespace: Optional[str] = None,
+ pretrained_file: Optional[str] = None) -> None:
+ super().__init__()
+ self.num_embeddings = num_embeddings
+ self.padding_index = padding_index
+ self.max_norm = max_norm
+ self.norm_type = norm_type
+ self.scale_grad_by_freq = scale_grad_by_freq
+ self.sparse = sparse
+ self._vocab_namespace = vocab_namespace
+ self._pretrained_file = pretrained_file
+ self.output_dim = projection_dim or embedding_dim
+
+ if weight is None:
+ weight = torch.FloatTensor(num_embeddings, embedding_dim)
+ self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
+ torch.nn.init.xavier_uniform_(self.weight)
+ else:
+ if weight.size() != (num_embeddings, embedding_dim):
+ raise ValueError(
+ "A weight matrix was passed with contradictory embedding "
+ "shapes.")
+ self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
+
+ if self.padding_index is not None:
+ self.weight.data[self.padding_index].fill_(0)
+
+ self._projection = None
+ if projection_dim:
+ self._projection = torch.nn.Linear(embedding_dim, projection_dim)
+
+ def forward(self, tokens: torch.Tensor) -> torch.Tensor: # type: ignore
+ # tokens may have extra dimensions (batch_size, d1, ..., dn,
+ # sequence_length), but embedding expects (batch_size, sequence_length),
+ # so pass tokens to util.combine_initial_dims (which is a no-op if
+ # there are no extra dimensions). Remember the original size.
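+ # For example, an id tensor of shape (batch_size, num_sentences,
+ # num_tokens) is viewed as (batch_size * num_sentences, num_tokens)
+ # for the lookup and then reshaped back afterwards.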
+ original_size = tokens.size() + tokens = combine_initial_dims(tokens) + embedded = embedding( + tokens, self.weight, padding_idx=self.padding_index, + max_norm=self.max_norm, norm_type=self.norm_type, + scale_grad_by_freq=self.scale_grad_by_freq, sparse=self.sparse) + # Now (if necessary) add back in the extra dimensions. + embedded = uncombine_initial_dims(embedded, original_size) + + if self._projection: + projection = self._projection + for _ in range(embedded.dim() - 2): + projection = TimeDistributed(projection) # type: ignore + embedded = projection(embedded) + return embedded + + +class TimeDistributed(torch.nn.Module): + r"""Given an input shaped like `(batch_size, time_steps, [rest])` and a + `Module` that takes inputs like `(batch_size, [rest])`, `TimeDistributed` + reshapes the input to be `(batch_size * time_steps, [rest])`, applies the + contained `Module`, then reshapes it back. + + Note that while the above gives shapes with `batch_size` first, this + `Module` also works if `batch_size` is second - we always just combine the + first two dimensions, then split them. + + It also reshapes keyword arguments unless they are not tensors or their + name is specified in the optional `pass_through` iterable. + """ + + def __init__(self, module): + super().__init__() + self._module = module + + def forward(self, *inputs, + pass_through: Optional[List[str]] = None, **kwargs): + pass_through = pass_through or [] + reshaped_inputs = [self._reshape_tensor(input_tensor) + for input_tensor in inputs] + # Need some input to then get the batch_size and time_steps. + some_input = None + if inputs: + some_input = inputs[-1] + reshaped_kwargs = {} + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor) and key not in pass_through: + if some_input is None: + some_input = value + value = self._reshape_tensor(value) + reshaped_kwargs[key] = value + reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) + if some_input is None: + raise RuntimeError("No input tensor to time-distribute") + # Now get the output back into the right shape. + # (batch_size, time_steps, **output_size) + new_size = some_input.size()[:2] + reshaped_outputs.size()[1:] + outputs = reshaped_outputs.contiguous().view(new_size) + return outputs + + @staticmethod + def _reshape_tensor(input_tensor): + input_size = input_tensor.size() + if len(input_size) <= 2: + raise RuntimeError(f"No dimension to distribute: {input_size}") + # Squash batch_size and time_steps into a single axis; result has shape + # (batch_size * time_steps, **input_size). + squashed_shape = [-1] + list(input_size[2:]) + return input_tensor.contiguous().view(*squashed_shape) + + +class ScalarMix(torch.nn.Module): + r"""Computes a parameterised scalar mixture of N tensors, + `mixture = gamma * sum(s_k * tensor_k)` where `s = softmax(w)`, with `w` + and `gamma` scalar parameters. + + In addition, if `do_layer_norm=True` then apply layer normalization to + each tensor before weighting. 
+ """ + def __init__(self, mixture_size: int, do_layer_norm: bool = False, + initial_scalar_parameters: Optional[List[float]] = None, + trainable: bool = True) -> None: + super().__init__() + self.mixture_size = mixture_size + self.do_layer_norm = do_layer_norm + + if initial_scalar_parameters is None: + initial_scalar_parameters = [0.0] * mixture_size + elif len(initial_scalar_parameters) != mixture_size: + raise ValueError( + "Length of initial_scalar_parameters {} differs " + "from mixture_size {}".format(initial_scalar_parameters, + mixture_size)) + self.scalar_parameters = ParameterList([ + Parameter(torch.FloatTensor([initial_scalar_parameters[i]]), + requires_grad=trainable) for i in range(mixture_size)]) + self.gamma = Parameter(torch.FloatTensor([1.0]), + requires_grad=trainable) + + def forward(self, tensors: List[torch.Tensor], # type: ignore + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + r"""Compute a weighted average of the `tensors`. The input tensors can + be any shape with at least two dimensions, but must all be the same + shape. + + When `do_layer_norm=True`, the `mask` is required input. If the + `tensors` are dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the + `mask` is dimensioned `(dim_0, ..., dim_{n-1})`, as in the typical + case with `tensors` of shape `(batch_size, timesteps, dim)` and `mask` + of shape `(batch_size, timesteps)`. + + When `do_layer_norm=False` the `mask` is ignored. + """ + if len(tensors) != self.mixture_size: + raise ValueError( + "{} tensors were passed, but the module was initialized to " + "mix {} tensors.".format(len(tensors), self.mixture_size)) + + def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): + tensor_masked = tensor * broadcast_mask + mean = torch.sum(tensor_masked) / num_elements_not_masked + variance = ( + torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / + num_elements_not_masked) + return (tensor - mean) / torch.sqrt(variance + 1e-12) + + # pylint: disable=unnecessary-comprehension + normed_weights = torch.nn.functional.softmax( + torch.cat([parameter for parameter in self.scalar_parameters]), + dim=0) + normed_weights = torch.split(normed_weights, split_size_or_sections=1) + + if not self.do_layer_norm: + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * tensor) + return self.gamma * sum(pieces) + else: + assert mask is not None + mask_float = mask.float() + broadcast_mask = mask_float.unsqueeze(-1) + input_dim = tensors[0].size(-1) + num_elements_not_masked = torch.sum(mask_float) * input_dim + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * _do_layer_norm(tensor, broadcast_mask, + num_elements_not_masked)) + return self.gamma * sum(pieces) + + +def add_sentence_boundary_token_ids( + tensor: torch.Tensor, mask: torch.Tensor, + sentence_begin_token: Any, sentence_end_token: Any) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Add begin/end of sentence tokens to the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)` this returns a tensor of shape + `(batch_size, timesteps + 2)` or `(batch_size, timesteps + 2, dim)` + respectively. + + Returns both the new tensor and updated mask. + + Args: + tensor: A tensor of shape `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)`. + mask: A tensor of shape `(batch_size, timesteps)`. + sentence_begin_token: For 2D input, a scalar with the id. + For 3D input, a tensor with length dim. 
+ sentence_end_token: For 2D input, a scalar with the id. + For 3D input, a tensor with length dim. + + Returns: + tensor_with_boundary_tokens: The tensor with the appended and prepended + boundary tokens. If the input was 2D, it has shape + `(batch_size, timesteps + 2)` and if the input was 3D, it has shape + `(batch_size, timesteps + 2, dim)`. + new_mask: The new mask for the tensor, taking into account the appended + tokens marking the beginning and end of the sentence. + """ + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] + 2 + tensor_with_boundary_tokens = tensor.new_zeros(*new_shape) + if len(tensor_shape) == 2: + tensor_with_boundary_tokens[:, 1:-1] = tensor + tensor_with_boundary_tokens[:, 0] = sentence_begin_token + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, j + 1] = sentence_end_token + new_mask = (tensor_with_boundary_tokens != 0).long() + elif len(tensor_shape) == 3: + tensor_with_boundary_tokens[:, 1:-1, :] = tensor + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, 0, :] = sentence_begin_token + tensor_with_boundary_tokens[i, j + 1, :] = sentence_end_token + new_mask = ( + (tensor_with_boundary_tokens > 0).long().sum(dim=-1) > 0).long() + else: + raise ValueError( + "add_sentence_boundary_token_ids only accepts 2D and 3D input") + return tensor_with_boundary_tokens, new_mask + + +def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Remove begin/end of sentence embeddings from the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps, dim)` + this returns a tensor of shape `(batch_size, timesteps - 2, dim)` after + removing the beginning and end sentence markers. The sentences are + assumed to be padded on the right, with the beginning of each sentence + assumed to occur at index 0 (i.e., `mask[:, 0]` is assumed to be 1). + + Returns both the new tensor and updated mask. + + This function is the inverse of `add_sentence_boundary_token_ids`. + + Args: + tensor: A tensor of shape `(batch_size, timesteps, dim)`. + mask: A tensor of shape `(batch_size, timesteps)`. + + Returns: + tensor_without_boundary_tokens: The tensor after removing the boundary + tokens of shape `(batch_size, timesteps - 2, dim)`. + new_mask: The new mask for the tensor of shape + `(batch_size, timesteps - 2)`. + """ + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] - 2 + tensor_without_boundary_tokens = tensor.new_zeros(*new_shape) + new_mask = tensor.new_zeros((new_shape[0], new_shape[1]), dtype=torch.long) + for i, j in enumerate(sequence_lengths): + if j > 2: + tensor_without_boundary_tokens[i, : (j - 2), :] = \ + tensor[i, 1: (j - 1), :] + new_mask[i, : (j - 2)] = 1 + return tensor_without_boundary_tokens, new_mask + + +def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], + gain: float = 1.0) -> None: + r"""An initializer which allows initializing model parameters in "blocks". + This is helpful in the case of recurrent models which use multiple gates + applied to linear projections, which can be computed efficiently if they + are concatenated together. However, they are separate parameters which + should be initialized independently. + + Args: + tensor: A tensor to initialize. 
+ split_sizes: A list of length `tensor.ndim()` specifying the size of the + blocks along that particular dimension. E.g. `[10, 20]` would + result in the tensor being split into chunks of size 10 along the + first dimension and 20 along the second. + gain: The gain (scaling) applied to the orthogonal initialization. + """ + data = tensor.data + sizes = list(tensor.size()) + if any(a % b != 0 for a, b in zip(sizes, split_sizes)): + raise ValueError( + "tensor dimensions must be divisible by their respective " + "split_sizes. Found size: {} and split_sizes: {}".format( + sizes, split_sizes)) + indexes = [list(range(0, max_size, split)) for max_size, split in zip( + sizes, split_sizes)] + # Iterate over all possible blocks within the tensor. + for block_start_indices in itertools.product(*indexes): + # A list of tuples containing the index to start at for this block + # and the appropriate step size (i.e split_size[i] for dimension i). + index_and_step_tuples = zip(block_start_indices, split_sizes) + # This is a tuple of slices corresponding to: + # tensor[index: index + step_size, ...]. This is required because we + # could have an arbitrary number of dimensions. The actual slices we + # need are the start_index: start_index + step for each dimension in + # the tensor. + block_slice = tuple( + slice(start_index, start_index + step) for start_index, step in + index_and_step_tuples) + data[block_slice] = torch.nn.init.orthogonal_( + tensor[block_slice].contiguous(), gain=gain) + + +def get_dropout_mask(dropout_probability: float, + tensor_for_masking: torch.Tensor): + r"""Computes and returns an element-wise dropout mask for a given tensor, + where each element in the mask is dropped out with probability + dropout_probability. Note that the mask is NOT applied to the tensor - + the tensor is passed to retain the correct CUDA tensor type for the mask. + + Args: + dropout_probability: Probability of dropping a dimension of the input. + tensor_for_masking: torch.Tensor, required. + + Returns: + A torch.FloatTensor consisting of the binary mask scaled by + `1 / (1 - dropout_probability)`. This scaling ensures expected values + and variances of the output of applying this mask and the original + tensor are the same. + """ + binary_mask = ( + torch.rand(tensor_for_masking.size()) > dropout_probability).to( + tensor_for_masking.device) + # Scale mask by 1/keep_prob to preserve output statistics. + dropout_mask = binary_mask.float().div(1.0 - dropout_probability) + return dropout_mask diff --git a/texar/torch/modules/pretrained/elmo_utils_test.py b/texar/torch/modules/pretrained/elmo_utils_test.py new file mode 100644 index 000000000..46bf680f5 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils_test.py @@ -0,0 +1,753 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for utils of ELMo modules. 
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/encoder_base_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/lstm_cell_with_projection_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/highway_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/time_distributed_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/initializers_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/util_test.py` +""" + +import unittest + +import h5py +import json +import numpy +import tempfile +import torch + +from numpy.testing import assert_array_almost_equal, assert_almost_equal +from torch.nn import LSTM, RNN, Embedding, Module, Parameter + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.data.data_utils import maybe_download +from texar.torch.modules.pretrained.elmo_utils import ( + Highway, LstmCellWithProjection, _EncoderBase, _ElmoBiLm, TimeDistributed, + remove_sentence_boundaries, add_sentence_boundary_token_ids, + block_orthogonal, ScalarMix) +from texar.torch.utils.test import cuda_test +from texar.torch.utils.utils import sort_batch_by_length + + +class TestElmoBiLm(unittest.TestCase): + + def setUp(self): + super().setUp() + self.tmp_dir = tempfile.TemporaryDirectory() + self.options_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/options.json?raw=true', + self.tmp_dir.name) + self.weight_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/lm_weights.hdf5?raw=true', + self.tmp_dir.name) + self.sentences_json_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/sentences.json?raw=true', + self.tmp_dir.name) + + def tearDown(self): + self.tmp_dir.cleanup() + + def _load_sentences_embeddings(self): + r"""Load the test sentences and the expected LM embeddings. + + These files loaded in this method were created with a batch-size of 3. + Due to idiosyncrasies with TensorFlow, the 30 sentences in + sentences.json are split into 3 files in which the k-th sentence in + each is from batch k. + + This method returns a (sentences, embeddings) pair where each is a + list of length batch_size. Each list contains a sublist with + total_sentence_count / batch_size elements. As with the original files, + the k-th element in the sublist is in batch k. + """ + with open(self.sentences_json_file) as fin: + sentences = json.load(fin) + + # the expected embeddings + expected_lm_embeddings = [] + for k in range(len(sentences)): + embed_fname = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/' + 'tests/fixtures/elmo/lm_embeddings_{}.hdf5?raw=true'.format(k), + self.tmp_dir.name) + expected_lm_embeddings.append([]) + with h5py.File(embed_fname, "r") as fin: + for i in range(10): + sent_embeds = fin["%s" % i][...] 
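+ # Concatenate the first two slices (stored on the first axis)
+ # along the feature axis to form the expected embedding that is
+ # compared against below.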
+ sent_embeds_concat = numpy.concatenate( + (sent_embeds[0, :, :], sent_embeds[1, :, :]), axis=-1) + expected_lm_embeddings[-1].append(sent_embeds_concat) + + return sentences, expected_lm_embeddings + + def test_elmo_bilm(self): + # get the raw data + sentences, expected_lm_embeddings = self._load_sentences_embeddings() + + # load the test model + elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file) + + batches = [[sentences[j][i].split() for j in range(3)] + for i in range(10)] + + # Now finally we can iterate through batches. + for i, batch in enumerate(batches): + lm_embeddings = elmo_bilm(batch_to_ids(batch[:3])) + top_layer_embeddings, mask = remove_sentence_boundaries( + lm_embeddings["activations"][2], lm_embeddings["mask"]) + + # check the mask lengths + lengths = mask.data.numpy().sum(axis=1) + batch_sentences = [sentences[k][i] for k in range(3)] + expected_lengths = [len(sentence.split()) for sentence in + batch_sentences] + self.assertEqual(lengths.tolist(), expected_lengths) + + # get the expected embeddings and compare! + expected_top_layer = [expected_lm_embeddings[k][i] for k in + range(3)] + for k in range(3): + self.assertTrue( + numpy.allclose( + top_layer_embeddings[k, : lengths[k], :].data.numpy(), + expected_top_layer[k], + atol=1.0e-6,)) + + +class TestEncoderBase(unittest.TestCase): + + def setUp(self): + super().setUp() + self.lstm = LSTM( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True) + self.rnn = RNN( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True) + self.encoder_base = _EncoderBase(stateful=True) + + tensor = torch.rand([5, 7, 3]) + tensor[1, 6:, :] = 0 + tensor[3, 2:, :] = 0 + self.tensor = tensor + mask = torch.ones(5, 7) + mask[1, 6:] = 0 + mask[2, :] = 0 # <= completely masked + mask[3, 2:] = 0 + mask[4, :] = 0 # <= completely masked + self.mask = mask + + self.batch_size = 5 + self.num_valid = 3 + sequence_lengths = mask.long().sum(-1) + _, _, restoration_indices, sorting_indices = sort_batch_by_length( + tensor, sequence_lengths) + self.sorting_indices = sorting_indices + self.restoration_indices = restoration_indices + + def test_non_stateful_states_are_sorted_correctly(self): + encoder_base = _EncoderBase(stateful=False) + initial_states = (torch.randn(6, 5, 7), torch.randn(6, 5, 7)) + # Check that we sort the state for non-stateful encoders. To test + # we'll just use a "pass through" encoder, as we aren't actually testing + # the functionality of the encoder here anyway. + _, states, restoration_indices = encoder_base.sort_and_run_forward( + lambda *x: x, self.tensor, self.mask, initial_states) + # Our input tensor had 2 zero length sequences, so we need + # to concat a tensor of shape + # (num_layers * num_directions, batch_size - num_valid, hidden_dim), + # to the output before unsorting it. + zeros = torch.zeros([6, 2, 7]) + + # sort_and_run_forward strips fully-padded instances from the batch; + # in order to use the restoration_indices we need to add back the two + # that got stripped. What we get back should match what we started with. 
+ for state, original in zip(states, initial_states): + assert list(state.size()) == [6, 3, 7] + state_with_zeros = torch.cat([state, zeros], 1) + unsorted_state = state_with_zeros.index_select(1, + restoration_indices) + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + unsorted_state[:, index, :].data.numpy(), + original[:, index, :].data.numpy()) + + def test_get_initial_states(self): + # First time we call it, there should be no state, so we should return + # None. + assert (self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices) is None) + + # First test the case that the previous state is _smaller_ than the + # current state input. + initial_states = (torch.randn([1, 3, 7]), torch.randn([1, 3, 7])) + self.encoder_base._states = initial_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices) + + correct_expanded_states = [torch.cat([state, torch.zeros([1, 2, 7])], 1) + for state in initial_states] + # State should have been expanded with zeros to have shape + # (1, batch_size, hidden_size). + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + correct_expanded_states[0].data.numpy()) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + correct_expanded_states[1].data.numpy()) + + # The returned states should be of shape (1, num_valid, hidden_size) and + # they also should have been sorted with respect to the indices. + # sorting indices are: [0, 1, 3, 2, 4] + + correct_returned_states = [ + state.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for state in correct_expanded_states] + + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_states[0].data.numpy()) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_states[1].data.numpy()) + + # Now test the case that the previous state is larger: + original_states = (torch.randn([1, 10, 7]), torch.randn([1, 10, 7])) + self.encoder_base._states = original_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices) + # State should not have changed, as they were larger + # than the batch size of the requested states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + original_states[0].data.numpy()) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + original_states[1].data.numpy()) + + # The returned states should be of shape (1, num_valid, hidden_size) + # and they also should have been sorted with respect to the indices. 
+ correct_returned_state = [ + x.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for x in original_states] + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_state[0].data.numpy()) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_state[1].data.numpy()) + + def test_update_states(self): + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices),) + + self.encoder_base._update_states(initial_states, + self.restoration_indices) + # State was None, so the updated state should just be the sorted given + # state. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + index_selected_initial_states[0].data.numpy()) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + index_selected_initial_states[1].data.numpy()) + + new_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + # tensor has 2 completely masked rows, so the last 2 rows of the _ + # sorted_ states will be completely zero, having been appended after + # calling the respective encoder. + new_states[0][:, -2:, :] = 0 + new_states[1][:, -2:, :] = 0 + + index_selected_new_states = ( + new_states[0].index_select(1, self.restoration_indices), + new_states[1].index_select(1, self.restoration_indices),) + + self.encoder_base._update_states(new_states, self.restoration_indices) + # Check that the update _preserved_ the state for the rows which were + # completely masked (2 and 4): + for index in [2, 4]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_initial_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_initial_states[1][:, index, :].data.numpy(),) + # Now the states which were updated: + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(),) + + # Now test the case that the new state is smaller: + small_new_states = torch.randn([1, 3, 7]), torch.randn([1, 3, 7]) + # pretend the 2nd sequence in the batch was fully masked. 
+ small_restoration_indices = torch.LongTensor([2, 0, 1]) + small_new_states[0][:, 0, :] = 0 + small_new_states[1][:, 0, :] = 0 + + index_selected_small_states = ( + small_new_states[0].index_select(1, small_restoration_indices), + small_new_states[1].index_select(1, small_restoration_indices),) + self.encoder_base._update_states(small_new_states, + small_restoration_indices) + + # Check the index for the row we didn't update is the same as the + # previous step: + for index in [1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(),) + # Indices we did update: + for index in [0, 2]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_small_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_small_states[1][:, index, :].data.numpy(),) + + # We didn't update index 4 in the previous step either, so it should + # be equal to the 4th index of initial states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 4, :].data.numpy(), + index_selected_initial_states[0][:, 4, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 4, :].data.numpy(), + index_selected_initial_states[1][:, 4, :].data.numpy(),) + + def test_reset_states(self): + # Initialize the encoder states. + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices),) + self.encoder_base._update_states(initial_states, + self.restoration_indices) + + # Check that only some of the states are reset when a mask is provided. + mask = torch.FloatTensor([1, 1, 0, 0, 0]) + self.encoder_base.reset_states(mask) + # First two states should be zeros + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[0])[:, :2, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[1])[:, :2, :].data.numpy(),) + # Remaining states should be the same + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 2:, :].data.numpy(), + index_selected_initial_states[0][:, 2:, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 2:, :].data.numpy(), + index_selected_initial_states[1][:, 2:, :].data.numpy(),) + + # Check that error is raised if mask has wrong batch size. + bad_mask = torch.FloatTensor([1, 1, 0]) + with self.assertRaises(ValueError): + self.encoder_base.reset_states(bad_mask) + + # Check that states are reset to None if no mask is provided. + self.encoder_base.reset_states() + assert self.encoder_base._states is None + + def test_non_contiguous_initial_states_handled(self): + # Check that the encoder is robust to non-contiguous initial states. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. 
+ encoder_base = _EncoderBase(stateful=False) + initial_states = (torch.randn(5, 6, 7).permute(1, 0, 2), + torch.randn(5, 6, 7).permute(1, 0, 2),) + assert not initial_states[0].is_contiguous() and \ + not initial_states[1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward(self.lstm, self.tensor, + self.mask, initial_states) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, + self.mask, initial_states[0]) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if + # its state was previously updated with non-contiguous tensors. As in + # the non-stateful tests, we check that the encoder still works on + # initial states for RNNs and LSTMs. + final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True) + encoder_base._update_states(final_states, self.restoration_indices) + encoder_base.sort_and_run_forward(self.lstm, self.tensor, self.mask) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], self.restoration_indices) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, self.mask) + + @cuda_test + def test_non_contiguous_initial_states_handled_on_gpu(self): + # Some PyTorch operations which produce contiguous tensors on the CPU + # produce non-contiguous tensors on the GPU (e.g. forward pass of an + # RNN when batch_first=True). Accordingly, we perform the same checks + # from previous test on the GPU to ensure the encoder is not affected + # by which device it is on. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. + encoder_base = _EncoderBase(stateful=False).cuda() + initial_states = (torch.randn(5, 6, 7).cuda().permute(1, 0, 2), + torch.randn(5, 6, 7).cuda().permute(1, 0, 2),) + assert not initial_states[0].is_contiguous() and not initial_states[ + 1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward( + self.lstm.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states) + encoder_base.sort_and_run_forward( + self.rnn.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states[0]) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if its + # state was previously updated with non-contiguous tensors. As in the + # non-stateful tests, we check that the encoder still works on initial + # states for RNNs and LSTMs. 
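+        # (The non-contiguous CUDA tensors from Case 1 are reused here as if
+        # they were the final states returned by a previous forward pass.)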
+ final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True).cuda() + encoder_base._update_states(final_states, + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.lstm.cuda(), self.tensor.cuda(), + self.mask.cuda()) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.rnn.cuda(), self.tensor.cuda(), + self.mask.cuda()) + + +class TestHighway(unittest.TestCase): + + def test_forward_works_on_simple_input(self): + highway = Highway(2, 2) + + highway._layers[0].weight.data.fill_(1) + highway._layers[0].bias.data.fill_(0) + highway._layers[1].weight.data.fill_(2) + highway._layers[1].bias.data.fill_(-2) + input_tensor = torch.FloatTensor([[-2, 1], [3, -2]]) + result = highway(input_tensor).data.numpy() + assert result.shape == (2, 2) + # This was checked by hand. + assert_almost_equal(result, [[-0.0394, 0.0197], [1.7527, -0.5550]], + decimal=4) + + def test_forward_works_on_nd_input(self): + highway = Highway(2, 2) + input_tensor = torch.ones(2, 2, 2) + output = highway(input_tensor) + assert output.size() == (2, 2, 2) + + +class TestLstmCellWithProjection(unittest.TestCase): + + def test_elmo_lstm_cell_completes_forward_pass(self): + input_tensor = torch.rand(4, 5, 3) + input_tensor[1, 4:, :] = 0.0 + input_tensor[2, 2:, :] = 0.0 + input_tensor[3, 1:, :] = 0.0 + + initial_hidden_state = torch.ones([1, 4, 5]) + initial_memory_state = torch.ones([1, 4, 7]) + + lstm = LstmCellWithProjection( + input_size=3, + hidden_size=5, + cell_size=7, + memory_cell_clip_value=2, + state_projection_clip_value=1,) + output_sequence, lstm_state = lstm( + input_tensor, [5, 4, 2, 1], (initial_hidden_state, + initial_memory_state)) + numpy.testing.assert_array_equal( + output_sequence.data[1, 4:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[2, 2:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[3, 1:, :].numpy(), 0.0) + + # Test the state clipping. + numpy.testing.assert_array_less(output_sequence.data.numpy(), 1.0) + numpy.testing.assert_array_less(-output_sequence.data.numpy(), 1.0) + + # LSTM state should be (num_layers, batch_size, hidden_size) + assert list(lstm_state[0].size()) == [1, 4, 5] + # LSTM memory cell should be (num_layers, batch_size, cell_size) + assert list((lstm_state[1].size())) == [1, 4, 7] + + # Test the cell clipping. 
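+        # lstm_state[1] is the memory cell, which the cell clamps to
+        # memory_cell_clip_value=2 at each timestep.
+        numpy.testing.assert_array_less(lstm_state[1].data.numpy(), 2.0)
+        numpy.testing.assert_array_less(-lstm_state[1].data.numpy(), 2.0)
+        # The projected hidden state was clipped even more tightly (to
+        # state_projection_clip_value=1), so it also stays within these
+        # bounds.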
+ numpy.testing.assert_array_less(lstm_state[0].data.numpy(), 2.0) + numpy.testing.assert_array_less(-lstm_state[0].data.numpy(), 2.0) + + +class TestTimeDistributed(unittest.TestCase): + + def test_time_distributed_reshapes_named_arg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter( + torch.FloatTensor([[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]]) + + def test_time_distributed_reshapes_positional_kwarg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter(torch.FloatTensor( + [[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(input=char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]]) + + def test_time_distributed_works_with_multiple_inputs(self): + module = lambda x, y: x + y + distributed = TimeDistributed(module) + x_input = torch.LongTensor([[[1, 2], [3, 4]]]) + y_input = torch.LongTensor([[[4, 2], [9, 1]]]) + output = distributed(x_input, y_input) + assert_almost_equal(output.data.numpy(), [[[5, 4], [12, 5]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_tensor_correctly(self): + + class FakeModule(Module): + def forward(self, input_tensor, tensor_to_pass_through=None, + another_tensor=None): + return input_tensor + tensor_to_pass_through + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_to_pass_through = torch.LongTensor([3, 7]) + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + tensor_to_pass_through=input_to_pass_through, + another_tensor=input_tensor2, + pass_through=["tensor_to_pass_through"],) + assert_almost_equal(output.data.numpy(), [[[8, 11], [15, 12]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_non_tensor_correctly(self): + + class FakeModule(Module): + + def forward(self, input_tensor, number=0, another_tensor=None): + + return input_tensor + number + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_number = 5 + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + number=input_number, + another_tensor=input_tensor2, + pass_through=["number"],) + assert_almost_equal(output.data.numpy(), [[[10, 9], [17, 10]]]) + + +class TestUtils(unittest.TestCase): + + def test_add_sentence_boundary_token_ids_handles_2D_input(self): + tensor = torch.from_numpy(numpy.array([[1, 2, 3], [4, 5, 0]])) + mask = (tensor > 0).long() + bos = 9 + eos = 10 + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array([[9, 1, 2, 3, 10], [9, 4, 5, 10, 0]]) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == (expected_new_tensor > 0)).all() + + def test_add_sentence_boundary_token_ids_handles_3D_input(self): + tensor = torch.from_numpy( + numpy.array([[[1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2]], + [[4, 3, 2, 1], [8, 7, 6, 5], [0, 0, 0, 0]]])) + 
mask = ((tensor > 0).sum(dim=-1) > 0).type(torch.LongTensor) + bos = torch.from_numpy(numpy.array([9, 9, 9, 9])) + eos = torch.from_numpy(numpy.array([10, 10, 10, 10])) + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array( + [[[9, 9, 9, 9], [1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2], + [10, 10, 10, 10]], + [[9, 9, 9, 9], [4, 3, 2, 1], [8, 7, 6, 5], [10, 10, 10, 10], + [0, 0, 0, 0]]]) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == ( + (expected_new_tensor > 0).sum(axis=-1) > 0)).all() + + def test_remove_sentence_boundaries(self): + tensor = torch.from_numpy(numpy.random.rand(3, 5, 7)) + mask = torch.from_numpy( + # The mask with two elements is to test the corner case + # of an empty sequence, so here we are removing boundaries + # from " " + numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]) + ).long() + new_tensor, new_mask = remove_sentence_boundaries(tensor, mask) + + expected_new_tensor = torch.zeros(3, 3, 7) + expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :] + expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :] + assert_array_almost_equal(new_tensor.data.numpy(), + expected_new_tensor.data.numpy()) + + expected_new_mask = torch.from_numpy(numpy.array( + [[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long() + assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all() + + def test_block_orthogonal_can_initialize(self): + tensor = torch.zeros([10, 6]) + block_orthogonal(tensor, [5, 3]) + tensor = tensor.data.numpy() + + def test_block_is_orthogonal(block) -> None: + matrix_product = block.T @ block + numpy.testing.assert_array_almost_equal( + matrix_product, numpy.eye(matrix_product.shape[-1]), 6) + + test_block_is_orthogonal(tensor[:5, :3]) + test_block_is_orthogonal(tensor[:5, 3:]) + test_block_is_orthogonal(tensor[5:, 3:]) + test_block_is_orthogonal(tensor[5:, :3]) + + def test_block_orthogonal_raises_on_mismatching_dimensions(self): + tensor = torch.zeros([10, 6, 8]) + with self.assertRaises(ValueError): + block_orthogonal(tensor, [7, 2, 1]) + + +class TestScalarMix(unittest.TestCase): + + def test_scalar_mix_can_run_forward(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + for k in range(3): + mixture.scalar_parameters[k].data[0] = 0.1 * (k + 1) + mixture.gamma.data[0] = 0.5 + result = mixture(tensors) + + weights = [0.1, 0.2, 0.3] + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = sum(normed_weights[k] * tensors[k].data.numpy() + for k in range(3)) + expected_result *= 0.5 + numpy.testing.assert_almost_equal(expected_result, result.data.numpy()) + + def test_scalar_mix_throws_error_on_incorrect_number_of_inputs(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(5)] + with self.assertRaises(ValueError): + _ = mixture(tensors) + + def test_scalar_mix_throws_error_on_incorrect_initial_scalar_parameters_length(self): + with self.assertRaises(ValueError): + ScalarMix(3, initial_scalar_parameters=[0.0, 0.0]) + + def test_scalar_mix_trainable_with_initial_scalar_parameters(self): + initial_scalar_parameters = [1.0, 2.0, 3.0] + mixture = ScalarMix(3, + initial_scalar_parameters=initial_scalar_parameters, + trainable=False) + for i, scalar_mix_parameter in enumerate(mixture.scalar_parameters): + assert scalar_mix_parameter.requires_grad is False + assert scalar_mix_parameter.item() == initial_scalar_parameters[i] + + def 
test_scalar_mix_layer_norm(self): + mixture = ScalarMix(3, do_layer_norm="scalar_norm_reg") + + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + numpy_mask = numpy.ones((3, 4), dtype="int32") + numpy_mask[1, 2:] = 0 + mask = torch.from_numpy(numpy_mask) + + weights = [0.1, 0.2, 0.3] + for k in range(3): + mixture.scalar_parameters[k].data[0] = weights[k] + mixture.gamma.data[0] = 0.5 + result = mixture(tensors, mask) + + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = numpy.zeros((3, 4, 5)) + for k in range(3): + mean = numpy.mean(tensors[k].data.numpy()[numpy_mask == 1]) + std = numpy.std(tensors[k].data.numpy()[numpy_mask == 1]) + normed_tensor = (tensors[k].data.numpy() - mean) / (std + 1e-12) + expected_result += normed_tensor * normed_weights[k] + expected_result *= 0.5 + + numpy.testing.assert_almost_equal(expected_result, result.data.numpy(), + decimal=6) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/utils/test.py b/texar/torch/utils/test.py index 26bdfe10e..50a28eb27 100644 --- a/texar/torch/utils/test.py +++ b/texar/torch/utils/test.py @@ -21,6 +21,7 @@ __all__ = [ "pretrained_test", "data_test", + "cuda_test", "external_library_test", ] @@ -35,6 +36,8 @@ def define_skip_condition(flag: str, explanation: str): 'TEST_PRETRAINED', "Test requires loading pre-trained checkpoints.") data_test = define_skip_condition( 'TEST_DATA', "Test requires loading large data files.") +cuda_test = define_skip_condition( + 'TEST_CUDA', "Test requires cuda.") def external_library_test(name: str): diff --git a/texar/torch/utils/utils.py b/texar/torch/utils/utils.py index 426081587..bb71e76c8 100644 --- a/texar/torch/utils/utils.py +++ b/texar/torch/utils/utils.py @@ -19,10 +19,12 @@ import copy import inspect from functools import lru_cache +from itertools import islice from pydoc import locate from typing import ( - Any, Callable, Collection, Dict, List, MutableMapping, Optional, Sequence, - Tuple, Type, TypeVar, Union, cast, no_type_check, overload) + Any, Callable, Collection, Dict, Iterable, Iterator, List, MutableMapping, + Optional, Sequence, Tuple, Type, TypeVar, Union, cast, no_type_check, + overload) import funcsigs import numpy as np @@ -67,6 +69,11 @@ 'uniquify_str', 'ceildiv', 'sum_tensors', + 'lazy_groups_of', + 'sort_batch_by_length', + 'get_device_of', + 'combine_initial_dims', + 'uncombine_initial_dims', ] T = TypeVar('T') # type argument @@ -1196,3 +1203,103 @@ def truncate_seq_pair(tokens_a: Union[List[int], List[str]], tokens_a.pop() else: tokens_b.pop() + + +A = TypeVar("A") + + +def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: + r"""Takes an iterable and batches the individual instances into lists of the + specified size. The last list may be smaller if there are instances left + over. + + Args: + iterable: An iterable object. + group_size: The group size. + + Returns: + An iterator. + """ + iterator = iter(iterable) + while True: + s = list(islice(iterator, group_size)) + if len(s) > 0: + yield s + else: + break + + +def sort_batch_by_length(tensor: torch.Tensor, + sequence_lengths: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Sort a batch first tensor by some specified lengths. + + Args: + tensor: A batch first tensor. + sequence_lengths: A tensor representing the lengths of some dimension of + the tensor which we want to sort by. 
+ + Returns: + sorted_tensor: The original tensor sorted along the batch dimension + with respect to `sequence_lengths`. + sorted_sequence_lengths: The original `sequence_lengths` sorted by + decreasing size. + restoration_indices: Indices into the `sorted_tensor` such that + ``sorted_tensor.index_select(0, restoration_indices) == + original_tensor`` + permutation_index: The indices used to sort the tensor. This is useful + if you want to sort many tensors using the same ordering. + """ + if not isinstance(tensor, torch.Tensor) or \ + not isinstance(sequence_lengths, torch.Tensor): + raise ValueError( + "Both the tensor and sequence lengths must be torch.Tensors.") + + sorted_sequence_lengths, permutation_index = sequence_lengths.sort( + 0, descending=True) + sorted_tensor = tensor.index_select(0, permutation_index) + + index_range = torch.arange(0, len(sequence_lengths), + device=sequence_lengths.device) + # This is the equivalent of zipping with index, sorting by the original + # sequence lengths and returning the now sorted indices. + _, reverse_mapping = permutation_index.sort(0, descending=False) + restoration_indices = index_range.index_select(0, reverse_mapping) + return (sorted_tensor, sorted_sequence_lengths, restoration_indices, + permutation_index) + + +def get_device_of(tensor: torch.Tensor) -> int: + r"""Returns the device of the tensor. + """ + if not tensor.is_cuda: + return -1 + else: + return tensor.get_device() + + +def combine_initial_dims(tensor: torch.Tensor) -> torch.Tensor: + r"""Given a (possibly higher order) tensor with shape + `[d1, ..., dn, sequence_length]` Return a view that's + `[d1 * ... * dn, sequence_length]`. If original tensor is 1-d or 2-d, + return it as is. + """ + if tensor.dim() <= 2: + return tensor + else: + return tensor.view(-1, tensor.size(-1)) + + +def uncombine_initial_dims(tensor: torch.Tensor, + original_size: torch.Size) -> torch.Tensor: + r"""Given a tensor of embeddings with shape + `[d1 * ... * dn, sequence_length, embedding_dim]` and the original shape + `[d1, ..., dn, sequence_length]`, return the reshaped tensor of embeddings + with shape `[d1, ..., dn, sequence_length, embedding_dim]`. + If original size is 1-d or 2-d, return it as is. + """ + if len(original_size) <= 2: + return tensor + else: + view_args = list(original_size) + [tensor.size(-1)] + return tensor.view(*view_args) diff --git a/texar/torch/utils/utils_test.py b/texar/torch/utils/utils_test.py index 2eb543a77..190d95e62 100644 --- a/texar/torch/utils/utils_test.py +++ b/texar/torch/utils/utils_test.py @@ -195,25 +195,50 @@ def test_truncate_seq_pair(self): self.assertListEqual(tokens_a, [1]) self.assertListEqual(tokens_b, [2, 3]) - # def test_map_ids_to_strs(self): - # """Tests :func:`texar.torch.utils.map_ids_to_strs`. 
- # """ - # vocab_list = ['word', '词'] - # vocab_file = tempfile.NamedTemporaryFile() - # vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - # vocab_file.flush() - # vocab = Vocab(vocab_file.name) - - # text = [['', 'word', '词', '', ''], - # ['word', '词', 'word', '词', '']] - # text = np.asarray(text) - # ids = vocab.map_tokens_to_ids_py(text) - - # ids = ids.tolist() - # text_ = utils.map_ids_to_strs(ids, vocab) - - # self.assertEqual(text_[0], 'word 词') - # self.assertEqual(text_[1], 'word 词 word 词') + def test_lazy_groups_of(self): + xs = [1, 2, 3, 4, 5, 6, 7] + groups = utils.lazy_groups_of(iter(xs), group_size=3) + assert next(groups) == [1, 2, 3] + assert next(groups) == [4, 5, 6] + assert next(groups) == [7] + with self.assertRaises(StopIteration): + _ = next(groups) + + def test_sort_batch_by_length(self): + tensor = torch.rand([5, 7, 9]) + tensor[0, 3:, :] = 0 + tensor[1, 4:, :] = 0 + tensor[2, 1:, :] = 0 + tensor[3, 5:, :] = 0 + + sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) + sorted_tensor, sorted_lengths, reverse_indices, _ = \ + utils.sort_batch_by_length(tensor, sequence_lengths) + + # Test sorted indices are padded correctly. + np.testing.assert_array_equal(sorted_tensor[1, 5:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[2, 4:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[3, 3:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[4, 1:, :].data.numpy(), 0.0) + + assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) + + # Test restoration indices correctly recover the original tensor. + assert sorted_tensor.index_select(0, reverse_indices).data.equal( + tensor.data) + + def test_combine_initial_dims(self): + tensor = torch.randn(4, 10, 20, 17, 5) + + tensor2d = utils.combine_initial_dims(tensor) + assert list(tensor2d.size()) == [4 * 10 * 20 * 17, 5] + + def test_uncombine_initial_dims(self): + embedding2d = torch.randn(4 * 10 * 20 * 17 * 5, 12) + + embedding = utils.uncombine_initial_dims(embedding2d, + torch.Size((4, 10, 20, 17, 5))) + assert list(embedding.size()) == [4, 10, 20, 17, 5, 12] if __name__ == "__main__":