diff --git a/docs/code/modules.rst b/docs/code/modules.rst index a1a8b1392..eada424b4 100644 --- a/docs/code/modules.rst +++ b/docs/code/modules.rst @@ -64,6 +64,11 @@ Encoders .. autoclass:: texar.torch.modules.BERTEncoder :members: +:hidden:`ELMoEncoder` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.torch.modules.ELMoEncoder + :members: + :hidden:`RoBERTaEncoder` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: texar.torch.modules.RoBERTaEncoder @@ -283,6 +288,11 @@ Pre-trained .. autoclass:: texar.torch.modules.PretrainedBERTMixin :members: +:hidden:`PretrainedELMoMixin` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.torch.modules.PretrainedELMoMixin + :members: + :hidden:`PretrainedRoBERTaMixin` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: texar.torch.modules.PretrainedRoBERTaMixin diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 38a5a84af..3b2871bd2 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -72,3 +72,4 @@ tokenizer wordpiece unigram TF +convnet diff --git a/requirements.txt b/requirements.txt index efdba2f84..22f7f8cdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ numpy >= 1.15.4 mypy_extensions >= 0.4.1 regex >= 2018.01.10 sentencepiece >= 0.1.8 +h5py >= 2.10.0 diff --git a/setup.py b/setup.py index 85fef9cb4..3b86d933c 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ install_requires=[ 'regex>=2018.01.10', 'numpy', + 'h5py>=2.10.0', 'requests', 'funcsigs', 'sentencepiece>=0.1.8', diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py new file mode 100644 index 000000000..9f51168a0 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py @@ -0,0 +1,131 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of pre-trained ELMo tokenizer. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/elmo_indexer.py` +""" +from typing import Dict, List, Optional + +import torch + +from torch.nn.utils.rnn import pad_sequence + + +__all__ = [ + "ELMoCharacterMapper", + "batch_to_ids", +] + + +def _make_bos_eos(character: int, + padding_character: int, + beginning_of_word_character: int, + end_of_word_character: int, + max_word_length: int): + char_ids = [padding_character] * max_word_length + char_ids[0] = beginning_of_word_character + char_ids[1] = character + char_ids[2] = end_of_word_character + return char_ids + + +class ELMoCharacterMapper: + r"""Maps individual tokens to sequences of character ids, compatible with + ELMo. To be consistent with previously trained models, we include it here as + special of existing character indexers. + + We allow to add optional additional special tokens with designated + character ids with `tokens_to_add`. 
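For orientation, here is a standalone sketch (not part of the change set; the helper name is made up) of the per-word layout that `convert_word_to_char_ids` below produces for an ordinary word: a begin-of-word marker, the word's UTF-8 bytes, an end-of-word marker, padding out to 50 positions, and a final +1 shift so that id 0 stays reserved for masking. The constants mirror the class attributes defined below.

```python
MAX_WORD_LENGTH = 50
BEGINNING_OF_WORD = 258
END_OF_WORD = 259
PADDING = 260


def word_to_char_ids_sketch(word: str) -> list:
    # Reserve two slots for the begin-of-word / end-of-word markers.
    encoded = word.encode("utf-8", "ignore")[:MAX_WORD_LENGTH - 2]
    char_ids = [PADDING] * MAX_WORD_LENGTH
    char_ids[0] = BEGINNING_OF_WORD
    for k, byte in enumerate(encoded, start=1):
        char_ids[k] = byte
    char_ids[len(encoded) + 1] = END_OF_WORD
    # Shift by +1 so that id 0 remains free for padding/masking.
    return [c + 1 for c in char_ids]


print(word_to_char_ids_sketch("cat")[:6])  # [259, 100, 98, 117, 260, 261]
```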
+ """ + + max_word_length = 50 + + # char ids 0-255 come from utf-8 encoding bytes + # assign 256-300 to special chars + beginning_of_sentence_character = 256 # + end_of_sentence_character = 257 # + beginning_of_word_character = 258 # + end_of_word_character = 259 # + padding_character = 260 # + + beginning_of_sentence_characters = _make_bos_eos( + beginning_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + end_of_sentence_characters = _make_bos_eos( + end_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + + bos_token = "" + eos_token = "" + + def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None: + self.tokens_to_add = tokens_to_add or {} + + def convert_word_to_char_ids(self, word: str) -> List[int]: + if word in self.tokens_to_add: + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character + char_ids[1] = self.tokens_to_add[word] + char_ids[2] = self.end_of_word_character + elif word == self.bos_token: + char_ids = self.beginning_of_sentence_characters + elif word == self.eos_token: + char_ids = self.end_of_sentence_characters + else: + word_encoded = word.encode("utf-8", "ignore")[: ( + self.max_word_length - 2)] + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character + for k, chr_id in enumerate(word_encoded, start=1): + char_ids[k] = chr_id + char_ids[len(word_encoded) + 1] = self.end_of_word_character + + # +1 one for masking + return [c + 1 for c in char_ids] + + def __eq__(self, other) -> bool: + if isinstance(self, other.__class__): + return self.__dict__ == other.__dict__ + return NotImplemented + + +def batch_to_ids(batch: List[List[str]]) -> torch.Tensor: + r"""Converts a batch of tokenized sentences to a tensor representing the + sentences with encoded characters (len(batch), max sentence length, + max word length). + + Args: + batch: A list of tokenized sentences. + + Returns: + A tensor of padded character ids. + """ + res = [] + mapper = ELMoCharacterMapper() + for sentence in batch: + character_ids = [mapper.convert_word_to_char_ids(token) + for token in sentence] + res.append(torch.tensor(character_ids)) + + return pad_sequence(res, batch_first=True) diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py new file mode 100644 index 000000000..32e2c7a24 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py @@ -0,0 +1,72 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for the utils of pre-trained ELMo tokenizer. 
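A quick usage sketch for `batch_to_ids` as defined above, assuming the module path introduced by this diff is importable: it pads a ragged batch of tokenized sentences into a single `(batch_size, max_sentence_length, 50)` tensor of character ids, with all-zero rows at padded token positions.

```python
import torch

from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids

# Two tokenized sentences of different lengths.
sentences = [["First", "sentence", "."], ["Another", "."]]

character_ids = batch_to_ids(sentences)

# One row of 50 character ids per token; the shorter sentence is zero-padded.
print(character_ids.shape)        # torch.Size([2, 3, 50])
print(character_ids[1, 2].sum())  # tensor(0) -- padding row
assert character_ids.dtype == torch.int64
```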
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + ELMoCharacterMapper, batch_to_ids) + + +class ELMoTokenizerUtilsTest(unittest.TestCase): + + def test_bos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + # [, , , , ... ] + expected_indices = [259, 257, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_eos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + expected_indices = [259, 258, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_unicode_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids(chr(256) + "t") + expected_indices = [259, 197, 129, 117, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_additional_tokens(self): + mapper = ELMoCharacterMapper(tokens_to_add={"": 1}) + indices = mapper.convert_word_to_char_ids("") + expected_indices = [259, 2, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) + self.assertEqual(indices, expected_indices) + + def test_batch_to_ids(self): + sentences = [['First', 'sentence', '.'], ['Another', '.']] + indices = batch_to_ids(sentences) + expected_indices = [[ + [259, 71, 106, 115, 116, 117, 260] + [261] * 43, + [259, 116, 102, 111, 117, 102, 111, 100, 102, 260] + [261] * 40, + [259, 47, 260] + [261] * 47], [ + [259, 66, 111, 112, 117, 105, 102, 115, 260] + [261] * 41, + [259, 47, 260] + [261] * 47, + [0] * 50]] + self.assertEqual(indices.tolist(), expected_indices) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/encoders/__init__.py b/texar/torch/modules/encoders/__init__.py index ce69fd985..1031dab5c 100644 --- a/texar/torch/modules/encoders/__init__.py +++ b/texar/torch/modules/encoders/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.encoders.bert_encoder import * from texar.torch.modules.encoders.conv_encoders import * +from texar.torch.modules.encoders.elmo_encoder import * from texar.torch.modules.encoders.encoder_base import * from texar.torch.modules.encoders.gpt2_encoder import * from texar.torch.modules.encoders.multihead_attention import * diff --git a/texar/torch/modules/encoders/elmo_encoder.py b/texar/torch/modules/encoders/elmo_encoder.py new file mode 100644 index 000000000..98c05f8a1 --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder.py @@ -0,0 +1,318 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ELMo encoder. 
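For context, a minimal usage sketch of the encoder defined in this file, mirroring the unit tests further down in the diff; it assumes the pre-trained `elmo-small` checkpoint can be downloaded, and 256 here is twice that model's 128-dimensional projection.

```python
from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids
from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder

# Loads the default "elmo-small" checkpoint (downloaded on first use).
encoder = ELMoEncoder(pretrained_model_name="elmo-small")

character_ids = batch_to_ids([["ELMo", "helps", "disambiguate", "."]])
outputs = encoder(character_ids)

# Two scalar-mixed representations by default (num_output_representations=2).
print(len(outputs["elmo_representations"]))      # 2
print(outputs["elmo_representations"][0].shape)  # torch.Size([1, 4, 256])
print(outputs["mask"].shape)                     # torch.Size([1, 4])
```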
+""" +import json +import os +import tempfile +import warnings + +from typing import Any, Dict, List, Optional, Union + +import torch + +from torch.nn.modules import Dropout + +from texar.torch.modules.encoders.encoder_base import EncoderBase +from texar.torch.modules.pretrained.elmo import PretrainedELMoMixin +from texar.torch.modules.pretrained.elmo_utils import ( + _ElmoBiLm, ScalarMix, remove_sentence_boundaries) + +__all__ = [ + "ELMoEncoder", +] + + +class ELMoEncoder(EncoderBase, PretrainedELMoMixin): + r"""ELMo model for encoding sequences. Please see + :class:`~texar.torch.modules.PretrainedELMoMixin` for a brief description + of ELMo. + + Args: + pretrained_model_name (optional): a `str`, the name + of pre-trained model (e.g., ``elmo-small``). Please refer to + :class:`~texar.torch.modules.PretrainedELMoMixin` for + all supported models. + If `None`, the model name in :attr:`hparams` is used. + cache_dir (optional): the path to a folder in which the + pre-trained models will be cached. If `None` (default), + a default directory (``texar_data`` folder under user's home + directory) will be used. + hparams (dict or HParams, optional): Hyperparameters. Missing + hyperparameter will be set to default values. See + :meth:`default_hparams` for the hyperparameter structure + and default values. + """ + def __init__(self, + pretrained_model_name: Optional[str] = None, + cache_dir: Optional[str] = None, + hparams=None): + super().__init__(hparams=hparams) + + self.load_pretrained_config(pretrained_model_name, cache_dir) + + options_file = None + weight_file = None + tmp_dir = tempfile.TemporaryDirectory() + if self.pretrained_model_dir is not None: + info = list(os.walk(self.pretrained_model_dir)) + root, _, files = info[0] + for file in files: + if file.endswith('options.json'): + options_file = os.path.join(root, file) + if file.endswith('weights.hdf5'): + weight_file = os.path.join(root, file) + else: + with open(os.path.join(tmp_dir.name, 'options.json'), "w") as fp: + json.dump(self.hparams.encoder.todict(), fp) + options_file = os.path.join(tmp_dir.name, 'options.json') + + assert options_file is not None + self._elmo_lstm = _ElmoBiLm( + options_file, weight_file, + requires_grad=self.hparams.requires_grad, + vocab_to_cache=self.hparams.vocab_to_cache) + tmp_dir.cleanup() + + self._has_cached_vocab = self.hparams.vocab_to_cache is not None + self._keep_sentence_boundaries = self.hparams.keep_sentence_boundaries + self._dropout = Dropout(p=self.hparams.dropout) + self._scalar_mixes: Any = [] + for k in range(self.hparams.num_output_representations): + scalar_mix = ScalarMix( + self._elmo_lstm.num_layers, + do_layer_norm=self.hparams.do_layer_norm, + initial_scalar_parameters=self.hparams.scalar_mix_parameters, + trainable=self.hparams.scalar_mix_parameters is None) + self.add_module("scalar_mix_{}".format(k), scalar_mix) + self._scalar_mixes.append(scalar_mix) + + @staticmethod + def default_hparams(): + r"""Returns a dictionary of hyperparameters with default values. + + * The encoder arch is determined by the constructor argument + :attr:`pretrained_model_name` if it's specified. In this case, + `hparams` are ignored. + * Otherwise, the encoder arch is determined by + `hparams['pretrained_model_name']` if it's specified. All other + configurations in `hparams` are ignored. + * If the above two are `None`, the encoder arch is defined by the + configurations in `hparams` and weights are randomly initialized. + + .. 
code-block:: python + + { + "pretrained_model_name": "elmo-small", + "encoder": { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], + [5, 256], [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + "num_output_representations": 2, + "requires_grad": False, + "do_layer_norm": False, + "dropout": 0.5, + "vocab_to_cache": None, + "keep_sentence_boundaries": False, + "scalar_mix_parameters": None, + "name": "elmo_encoder", + } + + Here: + + The default parameters are values for ELMo small model. + + `"pretrained_model_name"`: str or None + The name of the pre-trained ELMo model. If None, the model + will be randomly initialized. + + `"encoder"`: dict + Hyperparameters for ELMo encoder. + + `"num_output_representations"`: int + The number of ELMo representation to output with different linear + weighted combination of the 3 layers (i.e., character-convnet + output, the first LSTM output, the second LSTM output). + + `"requires_grad"`: bool + If True, compute gradient of ELMo parameters for fine tuning. + + `"do_layer_norm"`: bool + Should we apply layer normalization (passed to `ScalarMix`)? + + `"dropout"`: float + The dropout to be applied to the ELMo representations. + + `"vocab_to_cache"`: List[string] + A list of words to pre-compute and cache character convolutions + for. If you use this option, ELMo expects that you pass word + indices of shape `(batch_size, timesteps)` to forward, instead + of character indices. If you use this option and pass a word which + was not pre-cached, this will break. + + `"keep_sentence_boundaries"`: bool + If True, the representation of the sentence boundary tokens are + not removed. + + `"scalar_mix_parameters"`: List[float] + If not `None`, use these scalar mix parameters to weight the + representations produced by different layers. These mixing weights + are not updated during training. The mixing weights here should be + the unnormalized (i.e., pre-softmax) weights. So, if you wanted to + use only the 1st layer of a 2-layer ELMo, you can set this to + [-9e10, 1, -9e10 ]. + + `"name"`: str + Name of the module. + """ + return { + 'pretrained_model_name': 'elmo-small', + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + 'num_output_representations': 2, + 'requires_grad': False, + 'do_layer_norm': False, + 'dropout': 0.5, + 'vocab_to_cache': None, + 'keep_sentence_boundaries': False, + 'scalar_mix_parameters': None, + 'name': 'elmo_encoder', + '@no_typecheck': ['pretrained_model_name'] + } + + def forward(self, # type: ignore + inputs: torch.Tensor, + word_inputs: Optional[torch.Tensor] = None) -> \ + Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + Args: + inputs: Shape `[batch_size, max_time, 50]` of character ids + representing the current batch. + word_inputs: If you passed a cached vocab, you can in addition pass + a tensor of shape `[batch_size, max_time]`, which represent + word ids which have been pre-cached. 
+ + Returns: + A Dictionary with keys: + + - :attr:`elmo_representations`: A `num_output_representations` list + of ELMo representations for the input sequence. Each + representation is shape `[batch_size, max_time, embedding_dim]` + + - :attr:`mask`: Shape `(batch_size, timesteps)` long tensor + with sequence mask. + """ + # reshape the input if needed + original_shape = inputs.size() + if len(original_shape) > 3: + timesteps, num_characters = original_shape[-2:] + reshaped_inputs = inputs.view(-1, timesteps, num_characters) + else: + reshaped_inputs = inputs + + if word_inputs is not None: + original_word_size = word_inputs.size() + if self._has_cached_vocab and len(original_word_size) > 2: + reshaped_word_inputs = word_inputs.view(-1, + original_word_size[-1]) + elif not self._has_cached_vocab: + warnings.warn( + "Word inputs were passed to ELMo but it does not have a " + "cached vocab.") + reshaped_word_inputs = None # type: ignore + else: + reshaped_word_inputs = word_inputs + else: + reshaped_word_inputs = word_inputs # type: ignore + + # run the biLM + bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs) + layer_activations = bilm_output["activations"] + mask_with_bos_eos = bilm_output["mask"] + + # compute the elmo representations + representations = [] + for i in range(len(self._scalar_mixes)): + scalar_mix = getattr(self, "scalar_mix_{}".format(i)) + representation_with_bos_eos = scalar_mix(layer_activations, + mask_with_bos_eos) + if self._keep_sentence_boundaries: + processed_representation = representation_with_bos_eos + processed_mask = mask_with_bos_eos + else: + representation_without_bos_eos, mask_without_bos_eos = \ + remove_sentence_boundaries( + representation_with_bos_eos, mask_with_bos_eos) + processed_representation = representation_without_bos_eos + processed_mask = mask_without_bos_eos + representations.append(self._dropout(processed_representation)) + + # reshape if necessary + if word_inputs is not None and len(original_word_size) > 2: + mask = processed_mask.view(original_word_size) + elmo_representations = [ + representation.view(original_word_size + (-1,)) + for representation in representations] + elif len(original_shape) > 3: + mask = processed_mask.view(original_shape[:-1]) + elmo_representations = [ + representation.view(original_shape[:-1] + (-1,)) + for representation in representations] + else: + mask = processed_mask + elmo_representations = representations + + return {"elmo_representations": elmo_representations, "mask": mask} + + @property + def output_size(self): + return self._elmo_lstm.get_output_dim() diff --git a/texar/torch/modules/encoders/elmo_encoder_test.py b/texar/torch/modules/encoders/elmo_encoder_test.py new file mode 100644 index 000000000..04a34b359 --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder_test.py @@ -0,0 +1,146 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo Encoder. 
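The tests below cover the `requires_grad` hyperparameter; the following sketch summarizes the behaviour they check. It downloads the default `elmo-small` checkpoint and inspects the private `_elmo_lstm` attribute exactly as the test does; note that the scalar-mix weights stay trainable in both cases.

```python
from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder

# biLM weights are frozen by default; "requires_grad" enables fine-tuning.
frozen = ELMoEncoder()
tunable = ELMoEncoder(hparams={"requires_grad": True})

print(any(p.requires_grad for p in frozen._elmo_lstm.parameters()))   # False
print(all(p.requires_grad for p in tunable._elmo_lstm.parameters()))  # True
```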
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder +from texar.torch.utils.test import pretrained_test + + +class ELMoEncoderTest(unittest.TestCase): + r"""Tests :class:`~texar.torch.modules.ELMoEncoder` class. + """ + + @pretrained_test + def test_model_loading(self): + r"""Tests model loading functionality.""" + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + for pretrained_model_name in ELMoEncoder.available_checkpoints(): + encoder = ELMoEncoder(pretrained_model_name=pretrained_model_name) + _ = encoder(character_ids) + + def test_encode(self): + r"""Tests encoding. + """ + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + } + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + assert list(elmo_representations[0].size()) == [2, 7, 32] + assert list(elmo_representations[1].size()) == [2, 7, 32] + assert list(mask.size()) == [2, 7] + + def test_elmo_keep_sentence_boundaries(self): + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + }, + 'dropout': 0.0, + 'keep_sentence_boundaries': True, + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + # Add 2 to the lengths because we're keeping the start and end of + # sentence tokens. 
+ assert list(elmo_representations[0].size()) == [2, 7 + 2, 32] + assert list(elmo_representations[1].size()) == [2, 7 + 2, 32] + assert list(mask.size()) == [2, 7 + 2] + + @pretrained_test + def test_trainable_variables(self): + encoder = ELMoEncoder() + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is False for grad in elmo_grads) + + encoder = ELMoEncoder(hparams={'requires_grad': True}) + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is True for grad in elmo_grads) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/__init__.py b/texar/torch/modules/pretrained/__init__.py index 1f06a87a9..1e0ae19d3 100644 --- a/texar/torch/modules/pretrained/__init__.py +++ b/texar/torch/modules/pretrained/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.pretrained.pretrained_base import * from texar.torch.modules.pretrained.bert import * +from texar.torch.modules.pretrained.elmo import * from texar.torch.modules.pretrained.gpt2 import * from texar.torch.modules.pretrained.roberta import * from texar.torch.modules.pretrained.xlnet import * diff --git a/texar/torch/modules/pretrained/elmo.py b/texar/torch/modules/pretrained/elmo.py new file mode 100644 index 000000000..2783aa4e5 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo.py @@ -0,0 +1,104 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. +""" + +import json +import os + +from abc import ABC +from typing import Any, Dict + +from texar.torch.modules.pretrained.pretrained_base import PretrainedMixin + +__all__ = [ + "PretrainedELMoMixin", +] + +_ELMo_PATH = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/" + + +class PretrainedELMoMixin(PretrainedMixin, ABC): + r"""A mixin class to support loading pre-trained checkpoints for modules + that implement the ELMo model. + + The ELMo model was proposed in + `Deep contextualized word representations`_ + by `Peters et al.` from Allen Institute for Artificial Intelligence. It is + a deep bidirectional language model (`biLM`), which is pre-trained on a + large text corpus. + + The available ELMo models are as follows: + + * ``elmo-small``: 13.6M parameters, trained on 800M tokens. + * ``elmo-medium``: 28.0M parameters, trained on 800M tokens. + * ``elmo-original``: 93.6M parameters, trained on 800M tokens. + * ``elmo-original-5.5b``: 93.6M parameters, trained on 5.5B tokens. + + We provide the following ELMo classes: + + * :class:`~texar.torch.modules.ELMoEncoder` for text encoding. + + .. 
_`Deep contextualized word representations`: + https://arxiv.org/abs/1802.05365 + """ + _MODEL_NAME = "ELMo" + _MODEL2URL = { + 'elmo-small': [ + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_options.json', + ], + 'elmo-medium': [ + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_options.json', + ], + 'elmo-original': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_options.json', + ], + 'elmo-original-5.5b': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json', + ], + } + + @classmethod + def _transform_config(cls, pretrained_model_name: str, + cache_dir: str) -> Dict[str, Any]: + info = list(os.walk(cache_dir)) + root, _, files = info[0] + config_path = None + for file in files: + if file.endswith('options.json'): + config_path = os.path.join(root, file) + if config_path is None: + raise ValueError(f"Cannot find the config file in {cache_dir}") + + with open(config_path) as f: + config_elmo = json.loads(f.read()) + + return {'encoder': config_elmo} + + def _init_from_checkpoint(self, pretrained_model_name: str, + cache_dir: str, **kwargs): + return diff --git a/texar/torch/modules/pretrained/elmo_test.py b/texar/torch/modules/pretrained/elmo_test.py new file mode 100644 index 000000000..d31bb1f5a --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_test.py @@ -0,0 +1,71 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo utils. +""" + +import os +import unittest + +from texar.torch.modules.pretrained.elmo import * +from texar.torch.utils.test import pretrained_test + + +class ELMoUtilsTest(unittest.TestCase): + r"""Tests ELMo Utils. 
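As a sketch of how the mixin is meant to be used, mirroring the test below: `available_checkpoints` and `download_checkpoint` come from the `PretrainedMixin` base class and require network access, and `_transform_config` is private but is exercised the same way in the test.

```python
from texar.torch.modules.pretrained.elmo import PretrainedELMoMixin

# The four checkpoints registered in _MODEL2URL; each bundles an hdf5
# weight file and a JSON options file.
print(PretrainedELMoMixin.available_checkpoints())
# ['elmo-small', 'elmo-medium', 'elmo-original', 'elmo-original-5.5b']

# download_checkpoint returns the local cache directory; _transform_config
# then reads the options file into the "encoder" hyperparameter block.
cache_dir = PretrainedELMoMixin.download_checkpoint(
    pretrained_model_name="elmo-small")
hparams = PretrainedELMoMixin._transform_config(
    pretrained_model_name="elmo-small", cache_dir=cache_dir)
print(sorted(hparams["encoder"].keys()))  # ['char_cnn', 'lstm']
```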
+ """ + + @pretrained_test + def test_load_pretrained_elmo_AND_transform_elmo_to_texar_config(self): + pretrained_model_dir = PretrainedELMoMixin.download_checkpoint( + pretrained_model_name="elmo-small") + + info = list(os.walk(pretrained_model_dir)) + _, _, files = info[0] + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', files) + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_options.json', files) + + model_config = PretrainedELMoMixin._transform_config( + pretrained_model_name="elmo-small", + cache_dir=pretrained_model_dir) + + exp_config = { + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + } + + self.assertDictEqual(model_config, exp_config) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/elmo_utils.py b/texar/torch/modules/pretrained/elmo_utils.py new file mode 100644 index 000000000..be6e76c5d --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils.py @@ -0,0 +1,1710 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. 
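`ScalarMix`, exported below and instantiated by `ELMoEncoder`, combines the biLM layers with softmax-normalized learned weights and a global scale; its definition appears further down in this file. The following standalone sketch (made-up helper name; layer normalization and masking omitted) only illustrates the arithmetic, including the `[-9e10, 1, -9e10]` trick from the encoder documentation for selecting a single layer.

```python
import torch


def scalar_mix_sketch(layers, weights, gamma=1.0):
    """Minimal stand-in for ScalarMix: a softmax-weighted average of the
    biLM layers, scaled by a global gamma."""
    normed = torch.softmax(torch.tensor(weights), dim=0)
    return gamma * sum(w * layer for w, layer in zip(normed, layers))


# Three "layers" (char-CNN output plus two LSTM layers, as _ElmoBiLm below
# produces), each of shape (batch, time, dim).
layers = [torch.randn(2, 7, 32) for _ in range(3)]

# Pre-softmax weights that keep only the first LSTM layer.
mixed = scalar_mix_sketch(layers, weights=[-9e10, 1.0, -9e10])
print(torch.allclose(mixed, layers[1]))  # True
```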
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo_lstm.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/encoder_base.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/lstm_cell_with_projection.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/highway.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/scalar_mix.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/time_distributed.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/embedding.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/initializers.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py` +""" +import itertools +import json +import logging + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import h5py +import numpy +import torch + +from torch.nn import ParameterList, Parameter +from torch.nn.functional import embedding +from torch.nn.utils.rnn import ( + pad_packed_sequence, pack_padded_sequence, PackedSequence) + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + batch_to_ids, ELMoCharacterMapper) +from texar.torch.utils.utils import ( + combine_initial_dims, get_device_of, lazy_groups_of, sort_batch_by_length, + uncombine_initial_dims) + +# pylint: disable=attribute-defined-outside-init,protected-access + +__all__ = [ + "_ElmoBiLm", + "_ElmoCharacterEncoder", + "_EncoderBase", + "ElmoLstm", + "Embedding", + "Highway", + "LstmCellWithProjection", + "ScalarMix", + "TimeDistributed", + "add_sentence_boundary_token_ids", + "block_orthogonal", + "get_dropout_mask", + "remove_sentence_boundaries", +] + + +class _ElmoBiLm(torch.nn.Module): + r"""Run a pre-trained bidirectional language model, outputting the + activations at each layer for weighting together into an ELMo + representation. + + Args: + options_file: ELMo JSON options file + weight_file: ELMo hdf5 weight file + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. + vocab_to_cache: A list of words to pre-compute and cache character + convolutions for. If you use this option, `_ElmoBiLm` expects that + you pass word indices of shape `(batch_size, timesteps)` to forward, + instead of character indices. If you use this option and pass a word + which wasn't pre-cached, this will break. + """ + def __init__(self, options_file: str, weight_file: Optional[str] = None, + requires_grad: bool = False, + vocab_to_cache: Optional[List[str]] = None) -> None: + super().__init__() + self._token_embedder = _ElmoCharacterEncoder( + options_file, weight_file, requires_grad=requires_grad) + self._requires_grad = requires_grad + # This is an embedding, used to look up cached + # word vectors built from character level cnn embeddings. + self._word_embedding = None + self._bos_embedding: torch.Tensor = None # type: ignore + self._eos_embedding: torch.Tensor = None # type: ignore + if vocab_to_cache: + logging.info( + "Caching character cnn layers for words in vocabulary.") + # This sets 3 attributes, _word_embedding, _bos_embedding and + # _eos_embedding. They are set in the method so they can be accessed + # from outside the constructor. 
+ self.create_cached_cnn_embeddings(vocab_to_cache) + + with open(options_file, "r") as fin: + options = json.load(fin) + if not options["lstm"].get("use_skip_connections"): + raise ValueError( + "We only support pretrained biLMs with residual connections") + self._elmo_lstm = ElmoLstm( + input_size=options["lstm"]["projection_dim"], + hidden_size=options["lstm"]["projection_dim"], + cell_size=options["lstm"]["dim"], + num_layers=options["lstm"]["n_layers"], + memory_cell_clip_value=options["lstm"]["cell_clip"], + state_projection_clip_value=options["lstm"]["proj_clip"], + requires_grad=requires_grad) + + if weight_file is not None: + self._elmo_lstm.load_weights(weight_file) + # Number of representation layers including context independent layer + self.num_layers = options["lstm"]["n_layers"] + 1 + + def get_output_dim(self): + return 2 * self._token_embedder.get_output_dim() + + def forward( # type: ignore + self, inputs: torch.Tensor, word_inputs: Optional[torch.Tensor] = None + ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + Args: + inputs: Shape `(batch_size, timesteps, 50)` of character ids + representing the current batch. + word_inputs: If you passed a cached vocab, you can in addition pass + a tensor of shape `(batch_size, timesteps)`, which represent + word ids which have been pre-cached. + + Returns: + Dict with keys: + + - `'activations'`: A list of activations at each layer of the + network, each of shape `(batch_size, timesteps + 2, + embedding_dim)`. + - `'mask'`: Shape `(batch_size, timesteps + 2)` long tensor with + sequence mask. + + Note that the output tensors all include additional special begin + and end of sequence markers. + """ + if self._word_embedding is not None and word_inputs is not None: + try: + mask_without_bos_eos = (word_inputs > 0).long() + # The character cnn part is cached - just look it up. + embedded_inputs = self._word_embedding( + word_inputs) + # shape (batch_size, timesteps + 2, embedding_dim) + type_representation, mask = add_sentence_boundary_token_ids( + embedded_inputs, mask_without_bos_eos, self._bos_embedding, + self._eos_embedding) + except RuntimeError: + # Back off to running the character convolutions, + # as we might not have the words in the cache. + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + else: + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + lstm_outputs = self._elmo_lstm(type_representation, mask) + + # Prepare the output. The first layer is duplicated. + # Because of minor differences in how masking is applied depending + # on whether the char cnn layers are cached, we'll be defensive and + # multiply by the mask here. It's not strictly necessary, as the + # mask passed on is correct, but the values in the padded areas + # of the char cnn representations can change. 
+ output_tensors = [ + torch.cat([type_representation, type_representation], dim=-1) + * mask.float().unsqueeze(-1)] + for layer_activations in torch.chunk(lstm_outputs, + lstm_outputs.size(0), dim=0): + output_tensors.append(layer_activations.squeeze(0)) + + return {"activations": output_tensors, "mask": mask} + + def create_cached_cnn_embeddings(self, tokens: List[str]) -> None: + r"""Given a list of tokens, this method precomputes word representations + by running just the character convolutions and highway layers of elmo, + essentially creating uncontextual word vectors. On subsequent forward + passes, the word ids are looked up from an embedding, rather than being + computed on the fly via the CNN encoder. + + This function sets 3 attributes: + + _word_embedding: The word embedding for each word in the tokens passed + to this method. + _bos_embedding: The embedding for the BOS token. + _eos_embedding: The embedding for the EOS token. + + Args: + tokens: A list of tokens to precompute character convolutions for. + """ + tokens = [ELMoCharacterMapper.bos_token, + ELMoCharacterMapper.eos_token] + tokens + timesteps = 32 + batch_size = 32 + chunked_tokens = lazy_groups_of(iter(tokens), timesteps) + + all_embeddings = [] + device = get_device_of(next(self.parameters())) + for batch in lazy_groups_of(chunked_tokens, batch_size): + # Shape (batch_size, timesteps, 50) + batched_tensor = batch_to_ids(batch) + # NOTE: This device check is for when a user calls this method + # having already placed the model on a device. If this is called in + # the constructor, it will probably happen on the CPU. This isn't + # too bad, because it's only a few convolutions and will likely + # be very fast. + if device >= 0: + batched_tensor = batched_tensor.cuda(device) + output = self._token_embedder(batched_tensor) + token_embedding = output["token_embedding"] + mask = output["mask"] + token_embedding, _ = remove_sentence_boundaries(token_embedding, + mask) + all_embeddings.append(token_embedding.view( + -1, token_embedding.size(-1))) + full_embedding = torch.cat(all_embeddings, 0) + + # We might have some trailing embeddings from padding in the batch, so + # we clip the embedding and lookup to the right size. + full_embedding = full_embedding[: len(tokens), :] + embedding_ = full_embedding[2: len(tokens), :] + vocab_size, embedding_dim = list(embedding_.size()) + + self._bos_embedding = full_embedding[0, :] + self._eos_embedding = full_embedding[1, :] + self._word_embedding = Embedding( # type: ignore + vocab_size, + embedding_dim, + weight=embedding_.data, + trainable=self._requires_grad, + padding_index=0) + + +class _ElmoCharacterEncoder(torch.nn.Module): + r"""Compute context insensitive token representation using pre-trained biLM. + + This embedder has input character ids of size + `(batch_size, sequence_length, 50)` + and returns `(batch_size, sequence_length + 2, embedding_dim)`, where + `embedding_dim` is specified in the options file (typically 512). + + We add special entries at the beginning and end of each sequence + corresponding to and , the beginning and end of sentence tokens. + + Args: + options_file: ELMo JSON options file. + weight_file: ELMo hdf5 weight file. + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. 
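A runnable sketch of the shape contract described above, under the assumption that writing a small options dict (values borrowed from the unit tests in this diff) to a temporary file is an acceptable stand-in for a downloaded options file; with no weight file the parameters are randomly initialized, so only the shapes are meaningful.

```python
import json
import tempfile

from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids
from texar.torch.modules.pretrained.elmo_utils import _ElmoCharacterEncoder

# A tiny options dict in the same format as the pre-trained options files.
options = {
    "lstm": {"use_skip_connections": True, "projection_dim": 16,
             "cell_clip": 3, "proj_clip": 3, "dim": 64, "n_layers": 2},
    "char_cnn": {"activation": "relu",
                 "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]],
                 "n_highway": 2, "embedding": {"dim": 4},
                 "n_characters": 262, "max_characters_per_token": 50},
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump(options, fp)

encoder = _ElmoCharacterEncoder(options_file=fp.name, weight_file=None)
character_ids = batch_to_ids([["A", "sentence", "."]])  # (1, 3, 50)
output = encoder(character_ids)

# Sentence-boundary rows are added internally, hence timesteps + 2.
print(output["token_embedding"].shape)  # torch.Size([1, 5, 16])
print(output["mask"].shape)             # torch.Size([1, 5])
```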
+ """ + def __init__(self, options_file: str, weight_file: Optional[str] = None, + requires_grad: bool = False) -> None: + super().__init__() + + with open(options_file, "r") as fin: + self._options = json.load(fin) + self._weight_file = weight_file + + self.output_dim = self._options["lstm"]["projection_dim"] + self.requires_grad = requires_grad + + if weight_file is not None: + self._load_weights() + else: + # Do not load the weights + self._load_weights(False) + + # Cache the arrays for use in forward -- +1 due to masking. + self._beginning_of_sentence_characters = torch.from_numpy( + numpy.array( + ELMoCharacterMapper.beginning_of_sentence_characters) + 1) + self._end_of_sentence_characters = torch.from_numpy( + numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1) + + def get_output_dim(self): + return self.output_dim + + def forward(self, # type: ignore + inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + r"""Compute context insensitive token embeddings for ELMo + representations. + + Args: + inputs: Shape `(batch_size, sequence_length, 50)` of character ids + representing the current batch. + + Returns: + Dict with keys: + + - `'token_embedding'`: Shape `(batch_size, sequence_length + 2, + embedding_dim)` tensor with context insensitive token + representations. + - `'mask'`: Shape `(batch_size, sequence_length + 2)` long tensor + with sequence mask. + """ + # Add BOS/EOS + mask = ((inputs > 0).long().sum(dim=-1) > 0).long() + character_ids_with_bos_eos, mask_with_bos_eos = \ + add_sentence_boundary_token_ids( + inputs, mask, self._beginning_of_sentence_characters, + self._end_of_sentence_characters) + + # the character id embedding + max_chars_per_token = \ + self._options["char_cnn"]["max_characters_per_token"] + # (batch_size * sequence_length, max_chars_per_token, embed_dim) + character_embedding = torch.nn.functional.embedding( + character_ids_with_bos_eos.view(-1, max_chars_per_token), + self._char_embedding_weights) + + # run convolutions + cnn_options = self._options["char_cnn"] + activation: Callable + if cnn_options["activation"] == "tanh": + activation = torch.tanh + elif cnn_options["activation"] == "relu": + activation = torch.nn.functional.relu + else: + raise ValueError("Unknown activation") + + # (batch_size * sequence_length, embed_dim, max_chars_per_token) + character_embedding = torch.transpose(character_embedding, 1, 2) + convs = [] + for i in range(len(self._convolutions)): + conv = getattr(self, "char_conv_{}".format(i)) + convolved = conv(character_embedding) + # (batch_size * sequence_length, n_filters for this width) + convolved, _ = torch.max(convolved, dim=-1) + convolved = activation(convolved) + convs.append(convolved) + + # (batch_size * sequence_length, n_filters) + token_embedding = torch.cat(convs, dim=-1) + # apply the highway layers (batch_size * sequence_length, n_filters) + token_embedding = self._highways(token_embedding) + # final projection (batch_size * sequence_length, embedding_dim) + token_embedding = self._projection(token_embedding) + # reshape to (batch_size, sequence_length, embedding_dim) + batch_size, sequence_length, _ = character_ids_with_bos_eos.size() + + return {"mask": mask_with_bos_eos, + "token_embedding": token_embedding.view( + batch_size, sequence_length, -1)} + + def _load_weights(self, load_weights=True): + self._load_char_embedding(load_weights) + self._load_cnn_weights(load_weights) + self._load_highway(load_weights) + self._load_projection(load_weights) + + def _load_char_embedding(self, load_weights): + if 
load_weights: + with h5py.File(self._weight_file, "r") as fin: + char_embed_weights = fin["char_embed"][...] + + weights = numpy.zeros( + (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), + dtype="float32") + weights[1:, :] = char_embed_weights + + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad) + else: + weights = numpy.zeros(( + self._options['char_cnn']['n_characters'], + self._options['char_cnn']['embedding']['dim']), dtype="float32") + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad) + + def _load_cnn_weights(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + char_embed_dim = cnn_options["embedding"]["dim"] + + convolutions = [] + for i, (width, num) in enumerate(filters): + conv = torch.nn.Conv1d(in_channels=char_embed_dim, out_channels=num, + kernel_size=width, bias=True) + if load_weights: + # load the weights + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN"]["W_cnn_{}".format(i)][...] + bias = fin["CNN"]["b_cnn_{}".format(i)][...] + + w_reshaped = numpy.transpose(weight.squeeze(axis=0), + axes=(2, 1, 0)) + if w_reshaped.shape != tuple(conv.weight.data.shape): + raise ValueError("Invalid weight file") + conv.weight.data.copy_(torch.FloatTensor(w_reshaped)) + conv.bias.data.copy_(torch.FloatTensor(bias)) + conv.weight.requires_grad = self.requires_grad + conv.bias.requires_grad = self.requires_grad + + convolutions.append(conv) + self.add_module("char_conv_{}".format(i), conv) + self._convolutions = convolutions + + def _load_highway(self, load_weights): + # the highway layers have same dimensionality as the number of cnn + # filters + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + n_highway = cnn_options["n_highway"] + + # create the layers, and load the weights + self._highways = Highway(n_filters, n_highway, + activation=torch.nn.functional.relu) + if load_weights: + for k in range(n_highway): + # The AllenNLP highway is one matrix multplication with + # concatenation of transform and carry weights. + with h5py.File(self._weight_file, "r") as fin: + # The weights are transposed due to multiplication order + # assumptions in tf vs pytorch (tf.matmul(X, W) vs + # pytorch.matmul(W, X)) + w_transform = numpy.transpose( + fin["CNN_high_{}".format(k)]["W_transform"][...]) + # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but + # tf is (1 - g) * x + g * f(x) + w_carry = -1.0 * numpy.transpose( + fin["CNN_high_{}".format(k)]["W_carry"][...]) + weight = numpy.concatenate([w_transform, w_carry], axis=0) + self._highways._layers[k].weight.data.copy_( + torch.FloatTensor(weight)) + self._highways._layers[k].weight.requires_grad = \ + self.requires_grad + b_transform = \ + fin["CNN_high_{}".format(k)]["b_transform"][...] + b_carry = \ + -1.0 * fin["CNN_high_{}".format(k)]["b_carry"][...] 
+ bias = numpy.concatenate([b_transform, b_carry], axis=0) + self._highways._layers[k].bias.data.copy_( + torch.FloatTensor(bias)) + self._highways._layers[k].bias.requires_grad = \ + self.requires_grad + + def _load_projection(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + + self._projection = torch.nn.Linear(n_filters, self.output_dim, + bias=True) + if load_weights: + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN_proj"]["W_proj"][...] + bias = fin["CNN_proj"]["b_proj"][...] + self._projection.weight.data.copy_(torch.FloatTensor( + numpy.transpose(weight))) + self._projection.bias.data.copy_(torch.FloatTensor(bias)) + self._projection.weight.requires_grad = self.requires_grad + self._projection.bias.requires_grad = self.requires_grad + + +RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] +RnnStateStorage = Tuple[torch.Tensor, ...] + + +class _EncoderBase(torch.nn.Module): + r"""This abstract class serves as a base for `Encoder`. + + Additionally, this class provides functionality for sorting sequences by + length so they can be consumed by PyTorch RNN classes, which require their + inputs to be sorted by length. Finally, it also provides optional + statefulness to all of it's subclasses by allowing the caching and + retrieving of the hidden states of RNNs. + """ + def __init__(self, stateful: bool = False) -> None: + super().__init__() + self.stateful = stateful + self._states: Optional[RnnStateStorage] = None + + def sort_and_run_forward( + self, + module: Callable[[PackedSequence, Optional[RnnState]], + Tuple[Union[PackedSequence, torch.Tensor], + RnnState]], + inputs: torch.Tensor, mask: torch.Tensor, + hidden_state: Optional[RnnState] = None): + r"""This function exists because PyTorch RNNs require that their inputs + be sorted before being passed as input. As all of our Seq2xxxEncoders + use this functionality, it is provided in a base class. This method can + be called on any module which takes as input a `PackedSequence` and + some `hidden_state`, which can either be a tuple of tensors or a tensor. + + As all of our Seq2xxxEncoders have different return types, we return + `sorted` outputs from the module, which is called directly. + Additionally, we return the indices into the batch dimension required + to restore the tensor to it's correct, unsorted order and the number of + valid batch elements (i.e the number of elements in the batch which are + not completely masked). This un-sorting and re-padding of the module + outputs is left to the subclasses because their outputs have different + types and handling them smoothly here is difficult. + + Args: + module: A function to run on the inputs. In most cases, this is a + `torch.nn.Module`. + inputs: A tensor of shape `(batch_size, sequence_length, + embedding_size)` representing the inputs to the Encoder. + mask: A tensor of shape `(batch_size, sequence_length)`, + representing masked and non-masked elements of the sequence for + each element in the batch. + hidden_state: A single tensor of shape `(num_layers, batch_size, + hidden_size)` representing the state of an RNN with or a tuple + of tensors of shapes `(num_layers, batch_size, hidden_size)` and + `(num_layers, batch_size, memory_size)`, representing the hidden + state and memory state of an LSTM-like RNN. + + Returns: + module_output: A Tensor or `PackedSequence` representing the output + of the PyTorch Module. 
The batch size dimension will be equal to + `num_valid`, as sequences of zero length are clipped off before + the module is called, as PyTorch cannot handle zero length + sequences. + final_states: A Tensor representing the hidden state of the PyTorch + Module. This can either be a single tensor of shape + `(num_layers, num_valid, hidden_size)`, for instance in the case + of a GRU, or a tuple of tensors, such as those required for an + LSTM. + restoration_indices: A tensor of shape `(batch_size,)`, describing + the re-indexing required to transform the outputs back to their + original batch order. + """ + # In some circumstances you may have sequences of zero length. + # `pack_padded_sequence` requires all sequence lengths to be > 0, so + # remove sequences of zero length before calling self._module, then + # fill with zeros. + + # First count how many sequences are empty. + batch_size = mask.size(0) + num_valid = torch.sum(mask[:, 0]).int().item() + + sequence_lengths = mask.long().sum(-1) + (sorted_inputs, sorted_sequence_lengths, restoration_indices, + sorting_indices) = sort_batch_by_length(inputs, sequence_lengths) + + # Now create a PackedSequence with only the non-empty, sorted sequences. + packed_sequence_input = pack_padded_sequence( + sorted_inputs[:num_valid, :, :], + sorted_sequence_lengths[:num_valid].data.tolist(), batch_first=True) + # Prepare the initial states. + if not self.stateful: + if hidden_state is None: + initial_states: Any = hidden_state + elif isinstance(hidden_state, tuple): + initial_states = [state.index_select( + 1, sorting_indices)[:, :num_valid, :].contiguous() + for state in hidden_state] + else: + initial_states = hidden_state.index_select(1, sorting_indices)[ + :, :num_valid, :].contiguous() + else: + initial_states = self._get_initial_states(batch_size, num_valid, + sorting_indices) + + # Actually call the module on the sorted PackedSequence. + module_output, final_states = module(packed_sequence_input, + initial_states) + return module_output, final_states, restoration_indices + + def _get_initial_states(self, batch_size: int, num_valid: int, + sorting_indices: torch.LongTensor) -> \ + Optional[RnnState]: + r"""Returns an initial state for use in an RNN. Additionally, this + method handles the batch size changing across calls by mutating the + state to append initial states for new elements in the batch. Finally, + it also handles sorting the states with respect to the sequence lengths + of elements in the batch and removing rows which are completely padded. + Importantly, this `mutates` the state if the current batch size is + larger than when it was previously called. + + Args: + batch_size: The batch size can change size across calls to stateful + RNNs, so we need to know if we need to expand or shrink the + states before returning them. Expanded states will be set to + zero. + num_valid: The batch may contain completely padded sequences which + get removed before the sequence is passed through the encoder. + We also need to clip these off of the state too. + sorting_indices: Pytorch RNNs take sequences sorted by length. When + we return the states to be used for a given call to + `module.forward`, we need the states to match up to the sorted + sequences, so before returning them, we sort the states using + the same indices used to sort the sequences. + + Returns: + This method has a complex return type because it has to deal with + the first time it is called, when it has no state, and the fact that + types of RNN have heterogeneous states. 
+ + If it is the first time the module has been called, it returns + `None`, regardless of the type of the `Module`. + + Otherwise, for LSTMs, it returns a tuple of `torch.Tensors` with + shape `(num_layers, num_valid, state_size)` and `(num_layers, + num_valid, memory_size)` respectively, for GRUs, it returns a single + `torch.Tensor` of shape `(num_layers, num_valid, state_size)`. + """ + # We don't know the state sizes the first time calling forward, + # so we let the module define what it's initial hidden state looks like. + if self._states is None: + return None + + # Otherwise, we have some previous states. + if batch_size > self._states[0].size(1): + # This batch is larger than the all previous states. + # If so, resize the states. + num_states_to_concat = batch_size - self._states[0].size(1) + resized_states = [] + # state has shape (num_layers, batch_size, hidden_size) + for state in self._states: + # This _must_ be inside the loop because some + # RNNs have states with different last dimension sizes. + zeros = state.new_zeros(state.size(0), num_states_to_concat, + state.size(2)) + resized_states.append(torch.cat([state, zeros], 1)) + self._states = tuple(resized_states) + correctly_shaped_states = self._states + elif batch_size < self._states[0].size(1): + # This batch is smaller than the previous one. + correctly_shaped_states = tuple(state[:, :batch_size, :] for state + in self._states) + else: + correctly_shaped_states = self._states + + # At this point, our states are of shape (num_layers, batch_size, + # hidden_size). However, the encoder uses sorted sequences and + # additionally removes elements of the batch which are fully padded. + # We need the states to match up to these sorted and filtered + # sequences, so we do that in the next two blocks before returning the + # state/s. + if len(self._states) == 1: + # GRUs only have a single state. This `unpacks` it from the + # tuple and returns the tensor directly. + correctly_shaped_state = correctly_shaped_states[0] + sorted_state = correctly_shaped_state.index_select( + 1, sorting_indices) + return sorted_state[:, :num_valid, :].contiguous() + else: + # LSTMs have a state tuple of (state, memory). + sorted_states = [state.index_select(1, sorting_indices) for state in + correctly_shaped_states] + return tuple(state[:, :num_valid, :].contiguous() # type: ignore + for state in sorted_states) + + def _update_states(self, final_states: RnnStateStorage, + restoration_indices: torch.LongTensor) -> None: + r"""After the RNN has run forward, the states need to be updated. + This method just sets the state to the updated new state, performing + several pieces of book-keeping along the way - namely, unsorting the + states and ensuring that the states of completely padded sequences are + not updated. Finally, it also detaches the state variable from the + computational graph, such that the graph can be garbage collected after + each batch iteration. + + Args: + final_states: The hidden states returned as output from the RNN. + restoration_indices: The indices that invert the sorting used in + `sort_and_run_forward` to order the states with respect to the + lengths of the sequences in the batch. + """ + new_unsorted_states = [state.index_select(1, restoration_indices) for + state in final_states] + + if self._states is None: + # We don't already have states, so just set the + # ones we receive to be the current state. 
+ self._states = tuple(state.data for state in new_unsorted_states)
+ else:
+ # Now we've sorted the states back so that they correspond to the
+ # original indices, we need to figure out what states we need to
+ # update, because if we didn't use a state for a particular row,
+ # we want to preserve its state. Thankfully, the rows which are
+ # all zero in the state correspond exactly to those which aren't
+ # used, so we create masks of shape (new_batch_size,), denoting
+ # which states were used in the RNN computation.
+ current_state_batch_size = self._states[0].size(1)
+ new_state_batch_size = final_states[0].size(1)
+ # Masks denoting the used rows, of shape (1, new_batch_size, 1)
+ used_new_rows_mask = [(state[0, :, :].sum(-1) != 0.0).float().view(
+ 1, new_state_batch_size, 1) for state in new_unsorted_states]
+ new_states = []
+ if current_state_batch_size > new_state_batch_size:
+ # The new state is smaller than the old one,
+ # so just update the indices which we used.
+ for old_state, new_state, used_mask in zip(
+ self._states, new_unsorted_states, used_new_rows_mask):
+ # zero out all rows in the previous state
+ # which _were_ used in the current state.
+ masked_old_state = \
+ old_state[:, :new_state_batch_size, :] * (1 - used_mask)
+ # The old state is larger, so update the relevant parts of
+ # it.
+ old_state[:, :new_state_batch_size, :] = \
+ new_state + masked_old_state
+ new_states.append(old_state.detach())
+ else:
+ # The states are the same size, so we just have to
+ # deal with the possibility that some rows weren't used.
+ new_states = []
+ for old_state, new_state, used_mask in zip(
+ self._states, new_unsorted_states, used_new_rows_mask):
+ # zero out all rows which _were_ used in the current state.
+ masked_old_state = old_state * (1 - used_mask)
+ # Add the preserved rows of the old state back into the new
+ # state.
+ new_state += masked_old_state
+ new_states.append(new_state.detach())
+
+ # It looks like there should be another case handled here - when
+ # the current_state_batch_size < new_state_batch_size. However,
+ # this never happens, because the states themselves are mutated
+ # by appending zeros when calling _get_initial_states, meaning that
+ # the new states are either of equal size, or smaller, in the case
+ # that there are some unused elements (zero-length) for the RNN
+ # computation.
+ self._states = tuple(new_states)
+
+ def reset_states(self, mask: Optional[torch.Tensor] = None) -> None:
+ r"""Resets the internal states of a stateful encoder.
+
+ Args:
+ mask: A tensor of shape `(batch_size,)` indicating which states
+ should be reset. If not provided, all states will be reset.
+ """
+ if mask is None:
+ self._states = None
+ else:
+ # state has shape (num_layers, batch_size, hidden_size). We reshape
+ # mask to have shape (1, batch_size, 1) so that operations
+ # broadcast properly.
+ mask_batch_size = mask.size(0)
+ mask = mask.float().view(1, mask_batch_size, 1)
+ new_states = []
+ assert self._states is not None
+ for old_state in self._states:
+ old_state_batch_size = old_state.size(1)
+ if old_state_batch_size != mask_batch_size:
+ raise ValueError(
+ f"Trying to reset states using mask with incorrect "
+ f"batch size. "
+ f"Expected batch size: {old_state_batch_size}.
" + f"Provided batch size: {mask_batch_size}.") + new_state = (1 - mask) * old_state + new_states.append(new_state.detach()) + self._states = tuple(new_states) + + +class ElmoLstm(_EncoderBase): + r"""A stacked, bidirectional LSTM which uses `LstmCellWithProjection`'s + with highway layers between the inputs to layers. The inputs to the forward + and backward directions are independent - forward and backward states are + not concatenated between layers. + + Additionally, this LSTM maintains its `own` state, which is updated every + time `forward` is called. It is dynamically resized for different batch + sizes and is designed for use with non-continuous inputs (i.e inputs which + aren't formatted as a stream, such as text used for a language modeling + task, which is how stateful RNNs are typically used). + This is non-standard, but can be thought of as having an "end of sentence" + state, which is carried across different sentences. + + Args: + input_size: The dimension of the inputs to the LSTM. + hidden_size: The dimension of the outputs of the LSTM. + cell_size: The dimension of the memory cell of the + `LstmCellWithProjection`. + num_layers: The number of bidirectional LSTMs to use. + requires_grad: If True, compute gradient of ELMo parameters for fine + tuning. + recurrent_dropout_probability: The dropout probability to be used in a + dropout scheme as stated in [A Theoretically Grounded Application of + Dropout in Recurrent Neural Networks] + (https://arxiv.org/abs/1512.05287). + state_projection_clip_value: The magnitude with which to clip the + `hidden_state` after projecting it. + memory_cell_clip_value: The magnitude with which to clip the memory + cell. + """ + + def __init__(self, input_size: int, hidden_size: int, cell_size: int, + num_layers: int, requires_grad: bool = False, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None) -> None: + super().__init__(stateful=True) + + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.cell_size = cell_size + self.requires_grad = requires_grad + + forward_layers = [] + backward_layers = [] + + lstm_input_size = input_size + go_forward = True + for layer_index in range(num_layers): + forward_layer = LstmCellWithProjection( + lstm_input_size, hidden_size, cell_size, go_forward, + recurrent_dropout_probability, memory_cell_clip_value, + state_projection_clip_value) + backward_layer = LstmCellWithProjection( + lstm_input_size, hidden_size, cell_size, not go_forward, + recurrent_dropout_probability, memory_cell_clip_value, + state_projection_clip_value) + lstm_input_size = hidden_size + + self.add_module("forward_layer_{}".format(layer_index), + forward_layer) + self.add_module("backward_layer_{}".format(layer_index), + backward_layer) + forward_layers.append(forward_layer) + backward_layers.append(backward_layer) + self.forward_layers = forward_layers + self.backward_layers = backward_layers + + def forward(self, inputs: torch.Tensor, # type: ignore + mask: torch.LongTensor) -> torch.Tensor: + r"""Encodes the inputs. + + Args: + inputs: A Tensor of shape + `(batch_size, sequence_length, hidden_size)`. + mask: A binary mask of shape `(batch_size, sequence_length)` + representing the non-padded elements in each sequence in the + batch. 
+ + Returns: + A `torch.Tensor` of shape `(num_layers, batch_size, sequence_length, + hidden_size)`, where the `num_layers` dimension represents the LSTM + output from that layer. + """ + batch_size, total_sequence_length = mask.size() + stacked_sequence_output, final_states, restoration_indices = \ + self.sort_and_run_forward(self._lstm_forward, inputs, mask) + + num_layers, num_valid, returned_timesteps, encoder_dim = \ + stacked_sequence_output.size() + # Add back invalid rows which were removed in the call to + # sort_and_run_forward. + if num_valid < batch_size: + zeros = stacked_sequence_output.new_zeros( + num_layers, batch_size - num_valid, returned_timesteps, + encoder_dim) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 1) + # The states also need to have invalid rows added back. + new_states = [] + for state in final_states: + state_dim = state.size(-1) + zeros = state.new_zeros(num_layers, batch_size - num_valid, + state_dim) + new_states.append(torch.cat([state, zeros], 1)) + final_states = new_states + + # It's possible to need to pass sequences which are padded to longer + # than the max length of the sequence to a Seq2StackEncoder. However, + # packing and unpacking the sequences mean that the returned tensor + # won't include these dimensions, because the RNN did not need to + # process them. We add them back on in the form of zeros here. + sequence_length_difference = total_sequence_length - returned_timesteps + if sequence_length_difference > 0: + zeros = stacked_sequence_output.new_zeros( + num_layers, batch_size, sequence_length_difference, + stacked_sequence_output[0].size(-1)) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 2) + self._update_states(final_states, restoration_indices) + + # Restore the original indices and return the sequence. + # Has shape (num_layers, batch_size, sequence_length, hidden_size) + return stacked_sequence_output.index_select(1, restoration_indices) + + def _lstm_forward(self, inputs: PackedSequence, + initial_state: Optional[ + Tuple[torch.Tensor, torch.Tensor]] = None) -> \ + Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Encodes the inputs. + + Args: + inputs: A batch first `PackedSequence` to run the stacked LSTM over. + initial_state: A tuple (state, memory) representing the initial + hidden state and memory of the LSTM, with shape + `(num_layers, batch_size, 2 * hidden_size)` and + `(num_layers, batch_size, 2 * cell_size)` respectively. + + Returns: + output_sequence: The encoded sequence of shape + `(num_layers, batch_size, sequence_length, hidden_size)`. + final_states: The per-layer final (state, memory) states of the + LSTM, with shape `(num_layers, batch_size, 2 * hidden_size)` and + `(num_layers, batch_size, 2 * cell_size)` respectively. The + last dimension is duplicated because it contains the + state/memory for both the forward and backward layers. 
+ """ + if initial_state is None: + hidden_states: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = \ + [None] * len(self.forward_layers) + elif initial_state[0].size()[0] != len(self.forward_layers): + raise ValueError( + "Initial states were passed to forward() but the number of " + "initial states does not match the number of layers.") + else: + hidden_states = list(zip(initial_state[0].split(1, 0), + initial_state[1].split(1, 0))) + + inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) + forward_output_sequence = inputs + backward_output_sequence = inputs + + final_states = [] + sequence_outputs = [] + for layer_index, state in enumerate(hidden_states): + forward_layer = getattr(self, "forward_layer_{}".format( + layer_index)) + backward_layer = getattr(self, "backward_layer_{}".format( + layer_index)) + + forward_cache = forward_output_sequence + backward_cache = backward_output_sequence + + forward_state = None + backward_state = None + if state is not None: + forward_hidden_state, backward_hidden_state = state[0].split( + self.hidden_size, 2) + forward_memory_state, backward_memory_state = state[1].split( + self.cell_size, 2) + forward_state = (forward_hidden_state, forward_memory_state) + backward_state = (backward_hidden_state, backward_memory_state) + + forward_output_sequence, forward_state = forward_layer( + forward_output_sequence, batch_lengths, forward_state) + backward_output_sequence, backward_state = backward_layer( + backward_output_sequence, batch_lengths, backward_state) + # Skip connections, just adding the input to the output. + if layer_index != 0: + forward_output_sequence += forward_cache + backward_output_sequence += backward_cache + + sequence_outputs.append( + torch.cat([forward_output_sequence, backward_output_sequence], + -1)) + # Append the state tuples in a list, so that we can return + # the final states for all the layers. + final_states.append( + (torch.cat([forward_state[0], backward_state[0]], -1), + torch.cat([forward_state[1], backward_state[1]], -1))) + + stacked_sequence_outputs: torch.FloatTensor = torch.stack( + sequence_outputs) + # Stack the hidden state and memory for each layer into 2 tensors of + # shape (num_layers, batch_size, hidden_size) and + # (num_layers, batch_size, cell_size) respectively. + final_hidden_states, final_memory_states = zip(*final_states) + final_state_tuple: Tuple[torch.FloatTensor, torch.FloatTensor] = ( + torch.cat(final_hidden_states, 0), + torch.cat(final_memory_states, 0)) + return stacked_sequence_outputs, final_state_tuple + + def load_weights(self, weight_file: str) -> None: + r"""Load the pre-trained weights from the file. + """ + requires_grad = self.requires_grad + + with h5py.File(weight_file, "r") as fin: + for i_layer, lstms in enumerate(zip(self.forward_layers, + self.backward_layers)): + for j_direction, lstm in enumerate(lstms): + # lstm is an instance of LSTMCellWithProjection + cell_size = lstm.cell_size + dataset = fin["RNN_%s" % j_direction]["RNN"][ + "MultiRNNCell"]["Cell%s" % i_layer]["LSTMCell"] + # tensorflow packs together both W and U matrices into one + # matrix, but pytorch maintains individual matrices. In + # addition, tensorflow packs the gates as input, memory, + # forget, output but pytorch uses input, forget, memory, + # output. So we need to modify the weights. 
+ tf_weights = numpy.transpose(dataset["W_0"][...])
+ torch_weights = tf_weights.copy()
+
+ # split the W from U matrices
+ input_size = lstm.input_size
+ input_weights = torch_weights[:, :input_size]
+ recurrent_weights = torch_weights[:, input_size:]
+ tf_input_weights = tf_weights[:, :input_size]
+ tf_recurrent_weights = tf_weights[:, input_size:]
+
+ # handle the different gate order convention
+ for torch_w, tf_w in [
+ [input_weights, tf_input_weights],
+ [recurrent_weights, tf_recurrent_weights]]:
+ torch_w[(1 * cell_size): (2 * cell_size), :] = tf_w[
+ (2 * cell_size): (3 * cell_size), :]
+ torch_w[(2 * cell_size): (3 * cell_size), :] = tf_w[
+ (1 * cell_size): (2 * cell_size), :]
+
+ lstm.input_linearity.weight.data.copy_(torch.FloatTensor(
+ input_weights))
+ lstm.state_linearity.weight.data.copy_(torch.FloatTensor(
+ recurrent_weights))
+ lstm.input_linearity.weight.requires_grad = requires_grad
+ lstm.state_linearity.weight.requires_grad = requires_grad
+
+ # the bias weights
+ tf_bias = dataset["B"][...]
+ # tensorflow adds 1.0 to forget gate bias instead of
+ # modifying the parameters...
+ tf_bias[(2 * cell_size): (3 * cell_size)] += 1
+ torch_bias = tf_bias.copy()
+ torch_bias[(1 * cell_size): (2 * cell_size)] = tf_bias[
+ (2 * cell_size): (3 * cell_size)]
+ torch_bias[(2 * cell_size): (3 * cell_size)] = tf_bias[
+ (1 * cell_size): (2 * cell_size)]
+ lstm.state_linearity.bias.data.copy_(torch.FloatTensor(
+ torch_bias))
+ lstm.state_linearity.bias.requires_grad = requires_grad
+
+ # the projection weights
+ proj_weights = numpy.transpose(dataset["W_P_0"][...])
+ lstm.state_projection.weight.data.copy_(torch.FloatTensor(
+ proj_weights))
+ lstm.state_projection.weight.requires_grad = requires_grad
+
+
+class LstmCellWithProjection(torch.nn.Module):
+ r"""An LSTM with Recurrent Dropout and a projected and clipped hidden state
+ and memory. Note: this implementation is slower than the native PyTorch
+ LSTM because it cannot make use of CUDNN optimizations for stacked RNNs due
+ to the variational dropout and the custom nature of the cell state.
+
+ Args:
+ input_size: The dimension of the inputs to the LSTM.
+ hidden_size: The dimension of the outputs of the LSTM.
+ cell_size: The dimension of the memory cell used for the LSTM.
+ go_forward: The direction in which the LSTM is applied to the sequence.
+ Forwards by default, or backwards if False.
+ recurrent_dropout_probability: The dropout probability to be used in a
+ dropout scheme as stated in [A Theoretically Grounded Application of
+ Dropout in Recurrent Neural Networks]
+ (https://arxiv.org/abs/1512.05287). Implementation-wise, this simply
+ applies a fixed dropout mask per sequence to the recurrent
+ connection of the LSTM.
+ state_projection_clip_value: The magnitude with which to clip the
+ `hidden_state` after projecting it.
+ memory_cell_clip_value: The magnitude with which to clip the memory
+ cell.
+
+ Returns:
+ output_accumulator: The outputs of the LSTM for each timestep. A tensor
+ of shape `(batch_size, max_timesteps, hidden_size)` where for a
+ given batch element, all outputs past the sequence length for that
+ batch are zero tensors.
+ final_state: The final (state, memory) states of the LSTM, with shape
+ `(1, batch_size, hidden_size)` and `(1, batch_size, cell_size)`
+ respectively. The first dimension is 1 in order to match the PyTorch
+ API for returning stacked LSTM states.
+ """ + + def __init__(self, input_size: int, hidden_size: int, cell_size: int, + go_forward: bool = True, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None) -> None: + super().__init__() + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.cell_size = cell_size + + self.go_forward = go_forward + self.state_projection_clip_value = state_projection_clip_value + self.memory_cell_clip_value = memory_cell_clip_value + self.recurrent_dropout_probability = recurrent_dropout_probability + + # We do the projections for all the gates all at once. + self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, + bias=False) + self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, + bias=True) + # Additional projection matrix for making the hidden state smaller. + self.state_projection = torch.nn.Linear(cell_size, hidden_size, + bias=False) + self.reset_parameters() + + def reset_parameters(self): + # Use sensible default initializations for parameters. + block_orthogonal(self.input_linearity.weight.data, + [self.cell_size, self.input_size]) + block_orthogonal(self.state_linearity.weight.data, + [self.cell_size, self.hidden_size]) + + self.state_linearity.bias.data.fill_(0.0) + # Initialize forget gate biases to 1.0 as per An Empirical + # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015). + self.state_linearity.bias.data[self.cell_size: + 2 * self.cell_size].fill_(1.0) + + def forward(self, inputs: torch.FloatTensor, # type: ignore + batch_lengths: List[int], + initial_state: Optional[Tuple[torch.Tensor, + torch.Tensor]] = None): + r"""Process the inputs. + + Args: + inputs: A tensor of shape `(batch_size, num_timesteps, input_size)` + to apply the LSTM over. + batch_lengths: A list of length batch_size containing the lengths + of the sequences in batch. + initial_state: A tuple (state, memory) representing the initial + hidden state and memory of the LSTM. The `state` has shape + `(1, batch_size, hidden_size)` and the `memory` has shape + `(1, batch_size, cell_size)`. + + Returns: + output_accumulator: The outputs of the LSTM for each timestep. A + tensor of shape `(batch_size, max_timesteps, hidden_size)` where + for a given batch element, all outputs past the sequence length + for that batch are zero tensors. + final_state: A tuple (state, memory) representing the initial hidden + state and memory of the LSTM. The `state` has shape + `(1, batch_size, hidden_size)` and the `memory` has shape + `(1, batch_size, cell_size)`. + """ + batch_size = inputs.size()[0] + total_timesteps = inputs.size()[1] + output_accumulator = inputs.new_zeros(batch_size, total_timesteps, + self.hidden_size) + if initial_state is None: + full_batch_previous_memory = inputs.new_zeros(batch_size, + self.cell_size) + full_batch_previous_state = inputs.new_zeros(batch_size, + self.hidden_size) + else: + full_batch_previous_state = initial_state[0].squeeze(0) + full_batch_previous_memory = initial_state[1].squeeze(0) + + current_length_index = batch_size - 1 if self.go_forward else 0 + if self.recurrent_dropout_probability > 0.0 and self.training: + dropout_mask = get_dropout_mask( + self.recurrent_dropout_probability, full_batch_previous_state) + else: + dropout_mask = None + + for timestep in range(total_timesteps): + # The index depends on which end we start. 
+ index = timestep if self.go_forward else \ + total_timesteps - timestep - 1 + # What we are doing here is finding the index into the batch + # dimension which we need to use for this timestep, because the + # sequences have variable length, so once the index is greater than + # the length of this particular batch sequence, we no longer need + # to do the computation for this sequence. The key thing to + # recognise here is that the batch inputs must be _ordered_ by + # length from longest (first in batch) to shortest (last) so + # initially, we are going forwards with every sequence and as we + # pass the index at which the shortest elements of the batch finish, + # we stop picking them up for the computation. + if self.go_forward: + while batch_lengths[current_length_index] <= index: + current_length_index -= 1 + # If we're going backwards, we are _picking up_ more indices. + else: + # First conditional: Are we already at the maximum number of + # elements in the batch? + # Second conditional: Does the next shortest sequence beyond + # the current batch index require computation use this timestep? + while (current_length_index < (len(batch_lengths) - 1) + and batch_lengths[current_length_index + 1] > index): + current_length_index += 1 + + # Actually get the slices of the batch which we + # need for the computation at this timestep. + # shape (batch_size, cell_size) + previous_memory = \ + full_batch_previous_memory[0: current_length_index + 1].clone() + # Shape (batch_size, hidden_size) + previous_state = \ + full_batch_previous_state[0: current_length_index + 1].clone() + # Shape (batch_size, input_size) + timestep_input = inputs[0: current_length_index + 1, index] + + # Do the projections for all the gates all at once. + # Both have shape (batch_size, 4 * cell_size) + projected_input = self.input_linearity(timestep_input) + projected_state = self.state_linearity(previous_state) + + # Main LSTM equations using relevant chunks of the big linear + # projections of the hidden state and inputs. + input_gate = torch.sigmoid( + projected_input[:, (0 * self.cell_size): (1 * self.cell_size)] + + projected_state[:, (0 * self.cell_size): (1 * self.cell_size)] + ) + forget_gate = torch.sigmoid( + projected_input[:, (1 * self.cell_size): (2 * self.cell_size)] + + projected_state[:, (1 * self.cell_size): (2 * self.cell_size)] + ) + memory_init = torch.tanh( + projected_input[:, (2 * self.cell_size): (3 * self.cell_size)] + + projected_state[:, (2 * self.cell_size): (3 * self.cell_size)] + ) + output_gate = torch.sigmoid( + projected_input[:, (3 * self.cell_size): (4 * self.cell_size)] + + projected_state[:, (3 * self.cell_size): (4 * self.cell_size)] + ) + memory = input_gate * memory_init + forget_gate * previous_memory + + # Here is the non-standard part of this LSTM cell; first, we clip + # the memory cell, then we project the output of the timestep to a + # smaller size and again clip it. + if self.memory_cell_clip_value: + memory = torch.clamp(memory, -self.memory_cell_clip_value, + self.memory_cell_clip_value) + + # shape (current_length_index, cell_size) + pre_projection_timestep_output = output_gate * torch.tanh(memory) + + # shape (current_length_index, hidden_size) + timestep_output = self.state_projection( + pre_projection_timestep_output) + if self.state_projection_clip_value: + timestep_output = torch.clamp( + timestep_output, -self.state_projection_clip_value, + self.state_projection_clip_value) + + # Only do dropout if the dropout prob is > 0.0 and we are in + # training mode. 
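+ # `dropout_mask` was sampled once before the timestep loop, so the
+ # same mask is reused at every timestep - this is the fixed
+ # per-sequence (variational) recurrent dropout described in the
+ # class docstring.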
+ if dropout_mask is not None: + timestep_output = \ + timestep_output * dropout_mask[0: current_length_index + 1] + + # We've been doing computation with less than the full batch, so + # here we create a new variable for the the whole batch at this + # timestep and insert the result for the relevant elements of the + # batch into it. + full_batch_previous_memory = full_batch_previous_memory.clone() + full_batch_previous_state = full_batch_previous_state.clone() + full_batch_previous_memory[0: current_length_index + 1] = memory + full_batch_previous_state[0: current_length_index + 1] = \ + timestep_output + output_accumulator[0: current_length_index + 1, index] = \ + timestep_output + + # Mimic the pytorch API by returning state in the following shape: + # (num_layers * num_directions, batch_size, ...). As this + # LSTM cell cannot be stacked, the first dimension here is just 1. + final_state = (full_batch_previous_state.unsqueeze(0), + full_batch_previous_memory.unsqueeze(0)) + return output_accumulator, final_state + + +class Highway(torch.nn.Module): + r"""A [Highway layer](https://arxiv.org/abs/1505.00387) does a gated + combination of a linear transformation and a non-linear transformation of + its input. :math:`y = g * x + (1 - g) * f(A(x))`, where :math:`A` is a + linear transformation, :math:`f` is an element-wise non-linearity, and + :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. + + This module will apply a fixed number of highway layers to its input, + returning the final result. + + Args: + input_dim: The dimensionality of :math:`x`. We assume the input has + shape `(batch_size, ..., input_dim)`. + num_layers: The number of highway layers to apply to the input. + activation: The non-linearity to use in the highway layers. + """ + + def __init__(self, input_dim: int, num_layers: int = 1, + activation: Callable[[torch.Tensor], torch.Tensor] = + torch.nn.functional.relu) -> None: + super().__init__() + self._input_dim = input_dim + self._layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, input_dim * 2) + for _ in range(num_layers)]) + self._activation = activation + for layer in self._layers: + # We should bias the highway layer to just carry its input forward. + # We do that by setting the bias on `B(x)` to be positive, because + # that means `g` will be biased to be high, so we will carry the + # input forward. The bias on `B(x)` is the second half of the + # bias vector in each Linear layer. + layer.bias[input_dim:].data.fill_(1) # type: ignore + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: # type: ignore + current_input = inputs + for layer in self._layers: + projected_input = layer(current_input) + linear_part = current_input + # NOTE: if you modify this, think about whether you should modify + # the initialization above, too. + nonlinear_part, gate = projected_input.chunk(2, dim=-1) + nonlinear_part = self._activation(nonlinear_part) + gate = torch.sigmoid(gate) + current_input = gate * linear_part + (1 - gate) * nonlinear_part + return current_input + + +class Embedding(torch.nn.Module): + r"""A more featureful embedding module than the default in Pytorch. Adds + the ability to: + + 1. embed higher-order inputs + 2. pre-specify the weight matrix + 3. use a non-trainable embedding + 4. project the resultant embeddings to some other dimension (which only + makes sense with non-trainable embeddings). + + Args: + num_embeddings: Size of the dictionary of embeddings (vocabulary size). 
embedding_dim: The size of each embedding vector.
+ projection_dim: If given, we add a projection layer after the embedding
+ layer. This really only makes sense if `trainable` is `False`.
+ weight: A pre-initialised weight matrix for the embedding lookup,
+ allowing the use of pre-trained vectors.
+ padding_index: If given, pads the output with zeros whenever it
+ encounters the index.
+ trainable: Whether or not to optimize the embedding parameters.
+ max_norm: If given, will renormalize the embeddings to always have a
+ norm less than this.
+ norm_type: The p of the p-norm to compute for the max_norm option.
+ scale_grad_by_freq: If given, this will scale gradients by the frequency
+ of the words in the mini-batch.
+ sparse: Whether or not the Pytorch backend should use a sparse
+ representation of the embedding weight.
+ vocab_namespace: In case of fine-tuning/transfer learning, the model's
+ embedding matrix needs to be extended according to the size of the
+ extended vocabulary. To know how much to extend the embedding
+ matrix, it is necessary to know which `vocab_namespace` was used to
+ construct it in the original training. We store the
+ `vocab_namespace` used during the original training as an attribute,
+ so that it can be retrieved during fine-tuning.
+ pretrained_file: Used to keep track of the source of the weights and to
+ load more embeddings at test time. **It does not load the weights
+ from this pretrained_file.** For that purpose, use
+ `Embedding.from_params`.
+
+ Returns:
+ An Embedding module.
+ """
+
+ default_implementation = "embedding"
+
+ def __init__(self,
+ num_embeddings: int, embedding_dim: int,
+ projection_dim: Optional[int] = None,
+ weight: Optional[torch.FloatTensor] = None,
+ padding_index: Optional[int] = None, trainable: bool = True,
+ max_norm: Optional[float] = None, norm_type: float = 2.0,
+ scale_grad_by_freq: bool = False, sparse: bool = False,
+ vocab_namespace: Optional[str] = None,
+ pretrained_file: Optional[str] = None) -> None:
+ super().__init__()
+ self.num_embeddings = num_embeddings
+ self.padding_index = padding_index
+ self.max_norm = max_norm
+ self.norm_type = norm_type
+ self.scale_grad_by_freq = scale_grad_by_freq
+ self.sparse = sparse
+ self._vocab_namespace = vocab_namespace
+ self._pretrained_file = pretrained_file
+ self.output_dim = projection_dim or embedding_dim
+
+ if weight is None:
+ weight = torch.FloatTensor(num_embeddings, embedding_dim)
+ self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
+ torch.nn.init.xavier_uniform_(self.weight)
+ else:
+ if weight.size() != (num_embeddings, embedding_dim):
+ raise ValueError(
+ "A weight matrix was passed with contradictory embedding "
+ "shapes.")
+ self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
+
+ if self.padding_index is not None:
+ self.weight.data[self.padding_index].fill_(0)
+
+ self._projection = None
+ if projection_dim:
+ self._projection = torch.nn.Linear(embedding_dim, projection_dim)
+
+ def forward(self, tokens: torch.Tensor) -> torch.Tensor: # type: ignore
+ # tokens may have extra dimensions (batch_size, d1, ..., dn,
+ # sequence_length), but embedding expects (batch_size, sequence_length),
+ # so pass tokens to util.combine_initial_dims (which is a no-op if
+ # there are no extra dimensions). Remember the original size.
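+ # For example, an id tensor of shape (batch_size, num_sentences,
+ # num_tokens) is viewed as (batch_size * num_sentences, num_tokens)
+ # for the lookup and then reshaped back afterwards.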
+ original_size = tokens.size() + tokens = combine_initial_dims(tokens) + embedded = embedding( + tokens, self.weight, padding_idx=self.padding_index, + max_norm=self.max_norm, norm_type=self.norm_type, + scale_grad_by_freq=self.scale_grad_by_freq, sparse=self.sparse) + # Now (if necessary) add back in the extra dimensions. + embedded = uncombine_initial_dims(embedded, original_size) + + if self._projection: + projection = self._projection + for _ in range(embedded.dim() - 2): + projection = TimeDistributed(projection) # type: ignore + embedded = projection(embedded) + return embedded + + +class TimeDistributed(torch.nn.Module): + r"""Given an input shaped like `(batch_size, time_steps, [rest])` and a + `Module` that takes inputs like `(batch_size, [rest])`, `TimeDistributed` + reshapes the input to be `(batch_size * time_steps, [rest])`, applies the + contained `Module`, then reshapes it back. + + Note that while the above gives shapes with `batch_size` first, this + `Module` also works if `batch_size` is second - we always just combine the + first two dimensions, then split them. + + It also reshapes keyword arguments unless they are not tensors or their + name is specified in the optional `pass_through` iterable. + """ + + def __init__(self, module): + super().__init__() + self._module = module + + def forward(self, *inputs, + pass_through: Optional[List[str]] = None, **kwargs): + pass_through = pass_through or [] + reshaped_inputs = [self._reshape_tensor(input_tensor) + for input_tensor in inputs] + # Need some input to then get the batch_size and time_steps. + some_input = None + if inputs: + some_input = inputs[-1] + reshaped_kwargs = {} + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor) and key not in pass_through: + if some_input is None: + some_input = value + value = self._reshape_tensor(value) + reshaped_kwargs[key] = value + reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) + if some_input is None: + raise RuntimeError("No input tensor to time-distribute") + # Now get the output back into the right shape. + # (batch_size, time_steps, **output_size) + new_size = some_input.size()[:2] + reshaped_outputs.size()[1:] + outputs = reshaped_outputs.contiguous().view(new_size) + return outputs + + @staticmethod + def _reshape_tensor(input_tensor): + input_size = input_tensor.size() + if len(input_size) <= 2: + raise RuntimeError(f"No dimension to distribute: {input_size}") + # Squash batch_size and time_steps into a single axis; result has shape + # (batch_size * time_steps, **input_size). + squashed_shape = [-1] + list(input_size[2:]) + return input_tensor.contiguous().view(*squashed_shape) + + +class ScalarMix(torch.nn.Module): + r"""Computes a parameterised scalar mixture of N tensors, + `mixture = gamma * sum(s_k * tensor_k)` where `s = softmax(w)`, with `w` + and `gamma` scalar parameters. + + In addition, if `do_layer_norm=True` then apply layer normalization to + each tensor before weighting. 
+ """ + def __init__(self, mixture_size: int, do_layer_norm: bool = False, + initial_scalar_parameters: Optional[List[float]] = None, + trainable: bool = True) -> None: + super().__init__() + self.mixture_size = mixture_size + self.do_layer_norm = do_layer_norm + + if initial_scalar_parameters is None: + initial_scalar_parameters = [0.0] * mixture_size + elif len(initial_scalar_parameters) != mixture_size: + raise ValueError( + "Length of initial_scalar_parameters {} differs " + "from mixture_size {}".format(initial_scalar_parameters, + mixture_size)) + self.scalar_parameters = ParameterList([ + Parameter(torch.FloatTensor([initial_scalar_parameters[i]]), + requires_grad=trainable) for i in range(mixture_size)]) + self.gamma = Parameter(torch.FloatTensor([1.0]), + requires_grad=trainable) + + def forward(self, tensors: List[torch.Tensor], # type: ignore + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + r"""Compute a weighted average of the `tensors`. The input tensors can + be any shape with at least two dimensions, but must all be the same + shape. + + When `do_layer_norm=True`, the `mask` is required input. If the + `tensors` are dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the + `mask` is dimensioned `(dim_0, ..., dim_{n-1})`, as in the typical + case with `tensors` of shape `(batch_size, timesteps, dim)` and `mask` + of shape `(batch_size, timesteps)`. + + When `do_layer_norm=False` the `mask` is ignored. + """ + if len(tensors) != self.mixture_size: + raise ValueError( + "{} tensors were passed, but the module was initialized to " + "mix {} tensors.".format(len(tensors), self.mixture_size)) + + def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): + tensor_masked = tensor * broadcast_mask + mean = torch.sum(tensor_masked) / num_elements_not_masked + variance = ( + torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / + num_elements_not_masked) + return (tensor - mean) / torch.sqrt(variance + 1e-12) + + # pylint: disable=unnecessary-comprehension + normed_weights = torch.nn.functional.softmax( + torch.cat([parameter for parameter in self.scalar_parameters]), + dim=0) + normed_weights = torch.split(normed_weights, split_size_or_sections=1) + + if not self.do_layer_norm: + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * tensor) + return self.gamma * sum(pieces) + else: + assert mask is not None + mask_float = mask.float() + broadcast_mask = mask_float.unsqueeze(-1) + input_dim = tensors[0].size(-1) + num_elements_not_masked = torch.sum(mask_float) * input_dim + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * _do_layer_norm(tensor, broadcast_mask, + num_elements_not_masked)) + return self.gamma * sum(pieces) + + +def add_sentence_boundary_token_ids( + tensor: torch.Tensor, mask: torch.Tensor, + sentence_begin_token: Any, sentence_end_token: Any) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Add begin/end of sentence tokens to the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)` this returns a tensor of shape + `(batch_size, timesteps + 2)` or `(batch_size, timesteps + 2, dim)` + respectively. + + Returns both the new tensor and updated mask. + + Args: + tensor: A tensor of shape `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)`. + mask: A tensor of shape `(batch_size, timesteps)`. + sentence_begin_token: For 2D input, a scalar with the id. + For 3D input, a tensor with length dim. 
+ sentence_end_token: For 2D input, a scalar with the id. + For 3D input, a tensor with length dim. + + Returns: + tensor_with_boundary_tokens: The tensor with the appended and prepended + boundary tokens. If the input was 2D, it has shape + `(batch_size, timesteps + 2)` and if the input was 3D, it has shape + `(batch_size, timesteps + 2, dim)`. + new_mask: The new mask for the tensor, taking into account the appended + tokens marking the beginning and end of the sentence. + """ + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] + 2 + tensor_with_boundary_tokens = tensor.new_zeros(*new_shape) + if len(tensor_shape) == 2: + tensor_with_boundary_tokens[:, 1:-1] = tensor + tensor_with_boundary_tokens[:, 0] = sentence_begin_token + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, j + 1] = sentence_end_token + new_mask = (tensor_with_boundary_tokens != 0).long() + elif len(tensor_shape) == 3: + tensor_with_boundary_tokens[:, 1:-1, :] = tensor + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, 0, :] = sentence_begin_token + tensor_with_boundary_tokens[i, j + 1, :] = sentence_end_token + new_mask = ( + (tensor_with_boundary_tokens > 0).long().sum(dim=-1) > 0).long() + else: + raise ValueError( + "add_sentence_boundary_token_ids only accepts 2D and 3D input") + return tensor_with_boundary_tokens, new_mask + + +def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Remove begin/end of sentence embeddings from the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps, dim)` + this returns a tensor of shape `(batch_size, timesteps - 2, dim)` after + removing the beginning and end sentence markers. The sentences are + assumed to be padded on the right, with the beginning of each sentence + assumed to occur at index 0 (i.e., `mask[:, 0]` is assumed to be 1). + + Returns both the new tensor and updated mask. + + This function is the inverse of `add_sentence_boundary_token_ids`. + + Args: + tensor: A tensor of shape `(batch_size, timesteps, dim)`. + mask: A tensor of shape `(batch_size, timesteps)`. + + Returns: + tensor_without_boundary_tokens: The tensor after removing the boundary + tokens of shape `(batch_size, timesteps - 2, dim)`. + new_mask: The new mask for the tensor of shape + `(batch_size, timesteps - 2)`. + """ + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] - 2 + tensor_without_boundary_tokens = tensor.new_zeros(*new_shape) + new_mask = tensor.new_zeros((new_shape[0], new_shape[1]), dtype=torch.long) + for i, j in enumerate(sequence_lengths): + if j > 2: + tensor_without_boundary_tokens[i, : (j - 2), :] = \ + tensor[i, 1: (j - 1), :] + new_mask[i, : (j - 2)] = 1 + return tensor_without_boundary_tokens, new_mask + + +def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], + gain: float = 1.0) -> None: + r"""An initializer which allows initializing model parameters in "blocks". + This is helpful in the case of recurrent models which use multiple gates + applied to linear projections, which can be computed efficiently if they + are concatenated together. However, they are separate parameters which + should be initialized independently. + + Args: + tensor: A tensor to initialize. 
+ split_sizes: A list of length `tensor.ndim()` specifying the size of the + blocks along that particular dimension. E.g. `[10, 20]` would + result in the tensor being split into chunks of size 10 along the + first dimension and 20 along the second. + gain: The gain (scaling) applied to the orthogonal initialization. + """ + data = tensor.data + sizes = list(tensor.size()) + if any(a % b != 0 for a, b in zip(sizes, split_sizes)): + raise ValueError( + "tensor dimensions must be divisible by their respective " + "split_sizes. Found size: {} and split_sizes: {}".format( + sizes, split_sizes)) + indexes = [list(range(0, max_size, split)) for max_size, split in zip( + sizes, split_sizes)] + # Iterate over all possible blocks within the tensor. + for block_start_indices in itertools.product(*indexes): + # A list of tuples containing the index to start at for this block + # and the appropriate step size (i.e split_size[i] for dimension i). + index_and_step_tuples = zip(block_start_indices, split_sizes) + # This is a tuple of slices corresponding to: + # tensor[index: index + step_size, ...]. This is required because we + # could have an arbitrary number of dimensions. The actual slices we + # need are the start_index: start_index + step for each dimension in + # the tensor. + block_slice = tuple( + slice(start_index, start_index + step) for start_index, step in + index_and_step_tuples) + data[block_slice] = torch.nn.init.orthogonal_( + tensor[block_slice].contiguous(), gain=gain) + + +def get_dropout_mask(dropout_probability: float, + tensor_for_masking: torch.Tensor): + r"""Computes and returns an element-wise dropout mask for a given tensor, + where each element in the mask is dropped out with probability + dropout_probability. Note that the mask is NOT applied to the tensor - + the tensor is passed to retain the correct CUDA tensor type for the mask. + + Args: + dropout_probability: Probability of dropping a dimension of the input. + tensor_for_masking: torch.Tensor, required. + + Returns: + A torch.FloatTensor consisting of the binary mask scaled by + `1 / (1 - dropout_probability)`. This scaling ensures expected values + and variances of the output of applying this mask and the original + tensor are the same. + """ + binary_mask = ( + torch.rand(tensor_for_masking.size()) > dropout_probability).to( + tensor_for_masking.device) + # Scale mask by 1/keep_prob to preserve output statistics. + dropout_mask = binary_mask.float().div(1.0 - dropout_probability) + return dropout_mask diff --git a/texar/torch/modules/pretrained/elmo_utils_test.py b/texar/torch/modules/pretrained/elmo_utils_test.py new file mode 100644 index 000000000..46bf680f5 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils_test.py @@ -0,0 +1,753 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for utils of ELMo modules. 
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/encoder_base_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/lstm_cell_with_projection_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/highway_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/time_distributed_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/initializers_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/util_test.py` +""" + +import unittest + +import h5py +import json +import numpy +import tempfile +import torch + +from numpy.testing import assert_array_almost_equal, assert_almost_equal +from torch.nn import LSTM, RNN, Embedding, Module, Parameter + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.data.data_utils import maybe_download +from texar.torch.modules.pretrained.elmo_utils import ( + Highway, LstmCellWithProjection, _EncoderBase, _ElmoBiLm, TimeDistributed, + remove_sentence_boundaries, add_sentence_boundary_token_ids, + block_orthogonal, ScalarMix) +from texar.torch.utils.test import cuda_test +from texar.torch.utils.utils import sort_batch_by_length + + +class TestElmoBiLm(unittest.TestCase): + + def setUp(self): + super().setUp() + self.tmp_dir = tempfile.TemporaryDirectory() + self.options_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/options.json?raw=true', + self.tmp_dir.name) + self.weight_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/lm_weights.hdf5?raw=true', + self.tmp_dir.name) + self.sentences_json_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/sentences.json?raw=true', + self.tmp_dir.name) + + def tearDown(self): + self.tmp_dir.cleanup() + + def _load_sentences_embeddings(self): + r"""Load the test sentences and the expected LM embeddings. + + These files loaded in this method were created with a batch-size of 3. + Due to idiosyncrasies with TensorFlow, the 30 sentences in + sentences.json are split into 3 files in which the k-th sentence in + each is from batch k. + + This method returns a (sentences, embeddings) pair where each is a + list of length batch_size. Each list contains a sublist with + total_sentence_count / batch_size elements. As with the original files, + the k-th element in the sublist is in batch k. + """ + with open(self.sentences_json_file) as fin: + sentences = json.load(fin) + + # the expected embeddings + expected_lm_embeddings = [] + for k in range(len(sentences)): + embed_fname = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/' + 'tests/fixtures/elmo/lm_embeddings_{}.hdf5?raw=true'.format(k), + self.tmp_dir.name) + expected_lm_embeddings.append([]) + with h5py.File(embed_fname, "r") as fin: + for i in range(10): + sent_embeds = fin["%s" % i][...] 
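+ # Concatenate the first two slices (stored on the first axis)
+ # along the feature axis to form the expected embedding that is
+ # compared against below.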
+ sent_embeds_concat = numpy.concatenate( + (sent_embeds[0, :, :], sent_embeds[1, :, :]), axis=-1) + expected_lm_embeddings[-1].append(sent_embeds_concat) + + return sentences, expected_lm_embeddings + + def test_elmo_bilm(self): + # get the raw data + sentences, expected_lm_embeddings = self._load_sentences_embeddings() + + # load the test model + elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file) + + batches = [[sentences[j][i].split() for j in range(3)] + for i in range(10)] + + # Now finally we can iterate through batches. + for i, batch in enumerate(batches): + lm_embeddings = elmo_bilm(batch_to_ids(batch[:3])) + top_layer_embeddings, mask = remove_sentence_boundaries( + lm_embeddings["activations"][2], lm_embeddings["mask"]) + + # check the mask lengths + lengths = mask.data.numpy().sum(axis=1) + batch_sentences = [sentences[k][i] for k in range(3)] + expected_lengths = [len(sentence.split()) for sentence in + batch_sentences] + self.assertEqual(lengths.tolist(), expected_lengths) + + # get the expected embeddings and compare! + expected_top_layer = [expected_lm_embeddings[k][i] for k in + range(3)] + for k in range(3): + self.assertTrue( + numpy.allclose( + top_layer_embeddings[k, : lengths[k], :].data.numpy(), + expected_top_layer[k], + atol=1.0e-6,)) + + +class TestEncoderBase(unittest.TestCase): + + def setUp(self): + super().setUp() + self.lstm = LSTM( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True) + self.rnn = RNN( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True) + self.encoder_base = _EncoderBase(stateful=True) + + tensor = torch.rand([5, 7, 3]) + tensor[1, 6:, :] = 0 + tensor[3, 2:, :] = 0 + self.tensor = tensor + mask = torch.ones(5, 7) + mask[1, 6:] = 0 + mask[2, :] = 0 # <= completely masked + mask[3, 2:] = 0 + mask[4, :] = 0 # <= completely masked + self.mask = mask + + self.batch_size = 5 + self.num_valid = 3 + sequence_lengths = mask.long().sum(-1) + _, _, restoration_indices, sorting_indices = sort_batch_by_length( + tensor, sequence_lengths) + self.sorting_indices = sorting_indices + self.restoration_indices = restoration_indices + + def test_non_stateful_states_are_sorted_correctly(self): + encoder_base = _EncoderBase(stateful=False) + initial_states = (torch.randn(6, 5, 7), torch.randn(6, 5, 7)) + # Check that we sort the state for non-stateful encoders. To test + # we'll just use a "pass through" encoder, as we aren't actually testing + # the functionality of the encoder here anyway. + _, states, restoration_indices = encoder_base.sort_and_run_forward( + lambda *x: x, self.tensor, self.mask, initial_states) + # Our input tensor had 2 zero length sequences, so we need + # to concat a tensor of shape + # (num_layers * num_directions, batch_size - num_valid, hidden_dim), + # to the output before unsorting it. + zeros = torch.zeros([6, 2, 7]) + + # sort_and_run_forward strips fully-padded instances from the batch; + # in order to use the restoration_indices we need to add back the two + # that got stripped. What we get back should match what we started with. 
+ for state, original in zip(states, initial_states): + assert list(state.size()) == [6, 3, 7] + state_with_zeros = torch.cat([state, zeros], 1) + unsorted_state = state_with_zeros.index_select(1, + restoration_indices) + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + unsorted_state[:, index, :].data.numpy(), + original[:, index, :].data.numpy()) + + def test_get_initial_states(self): + # First time we call it, there should be no state, so we should return + # None. + assert (self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices) is None) + + # First test the case that the previous state is _smaller_ than the + # current state input. + initial_states = (torch.randn([1, 3, 7]), torch.randn([1, 3, 7])) + self.encoder_base._states = initial_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices) + + correct_expanded_states = [torch.cat([state, torch.zeros([1, 2, 7])], 1) + for state in initial_states] + # State should have been expanded with zeros to have shape + # (1, batch_size, hidden_size). + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + correct_expanded_states[0].data.numpy()) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + correct_expanded_states[1].data.numpy()) + + # The returned states should be of shape (1, num_valid, hidden_size) and + # they also should have been sorted with respect to the indices. + # sorting indices are: [0, 1, 3, 2, 4] + + correct_returned_states = [ + state.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for state in correct_expanded_states] + + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_states[0].data.numpy()) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_states[1].data.numpy()) + + # Now test the case that the previous state is larger: + original_states = (torch.randn([1, 10, 7]), torch.randn([1, 10, 7])) + self.encoder_base._states = original_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices) + # State should not have changed, as they were larger + # than the batch size of the requested states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + original_states[0].data.numpy()) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + original_states[1].data.numpy()) + + # The returned states should be of shape (1, num_valid, hidden_size) + # and they also should have been sorted with respect to the indices. 
+ correct_returned_state = [ + x.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for x in original_states] + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_state[0].data.numpy()) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_state[1].data.numpy()) + + def test_update_states(self): + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices),) + + self.encoder_base._update_states(initial_states, + self.restoration_indices) + # State was None, so the updated state should just be the sorted given + # state. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + index_selected_initial_states[0].data.numpy()) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + index_selected_initial_states[1].data.numpy()) + + new_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + # tensor has 2 completely masked rows, so the last 2 rows of the _ + # sorted_ states will be completely zero, having been appended after + # calling the respective encoder. + new_states[0][:, -2:, :] = 0 + new_states[1][:, -2:, :] = 0 + + index_selected_new_states = ( + new_states[0].index_select(1, self.restoration_indices), + new_states[1].index_select(1, self.restoration_indices),) + + self.encoder_base._update_states(new_states, self.restoration_indices) + # Check that the update _preserved_ the state for the rows which were + # completely masked (2 and 4): + for index in [2, 4]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_initial_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_initial_states[1][:, index, :].data.numpy(),) + # Now the states which were updated: + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(),) + + # Now test the case that the new state is smaller: + small_new_states = torch.randn([1, 3, 7]), torch.randn([1, 3, 7]) + # pretend the 2nd sequence in the batch was fully masked. 
+ small_restoration_indices = torch.LongTensor([2, 0, 1]) + small_new_states[0][:, 0, :] = 0 + small_new_states[1][:, 0, :] = 0 + + index_selected_small_states = ( + small_new_states[0].index_select(1, small_restoration_indices), + small_new_states[1].index_select(1, small_restoration_indices),) + self.encoder_base._update_states(small_new_states, + small_restoration_indices) + + # Check the index for the row we didn't update is the same as the + # previous step: + for index in [1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(),) + # Indices we did update: + for index in [0, 2]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_small_states[0][:, index, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_small_states[1][:, index, :].data.numpy(),) + + # We didn't update index 4 in the previous step either, so it should + # be equal to the 4th index of initial states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 4, :].data.numpy(), + index_selected_initial_states[0][:, 4, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 4, :].data.numpy(), + index_selected_initial_states[1][:, 4, :].data.numpy(),) + + def test_reset_states(self): + # Initialize the encoder states. + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices),) + self.encoder_base._update_states(initial_states, + self.restoration_indices) + + # Check that only some of the states are reset when a mask is provided. + mask = torch.FloatTensor([1, 1, 0, 0, 0]) + self.encoder_base.reset_states(mask) + # First two states should be zeros + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[0])[:, :2, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[1])[:, :2, :].data.numpy(),) + # Remaining states should be the same + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 2:, :].data.numpy(), + index_selected_initial_states[0][:, 2:, :].data.numpy(),) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 2:, :].data.numpy(), + index_selected_initial_states[1][:, 2:, :].data.numpy(),) + + # Check that error is raised if mask has wrong batch size. + bad_mask = torch.FloatTensor([1, 1, 0]) + with self.assertRaises(ValueError): + self.encoder_base.reset_states(bad_mask) + + # Check that states are reset to None if no mask is provided. + self.encoder_base.reset_states() + assert self.encoder_base._states is None + + def test_non_contiguous_initial_states_handled(self): + # Check that the encoder is robust to non-contiguous initial states. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. 
+ encoder_base = _EncoderBase(stateful=False) + initial_states = (torch.randn(5, 6, 7).permute(1, 0, 2), + torch.randn(5, 6, 7).permute(1, 0, 2),) + assert not initial_states[0].is_contiguous() and \ + not initial_states[1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward(self.lstm, self.tensor, + self.mask, initial_states) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, + self.mask, initial_states[0]) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if + # its state was previously updated with non-contiguous tensors. As in + # the non-stateful tests, we check that the encoder still works on + # initial states for RNNs and LSTMs. + final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True) + encoder_base._update_states(final_states, self.restoration_indices) + encoder_base.sort_and_run_forward(self.lstm, self.tensor, self.mask) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], self.restoration_indices) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, self.mask) + + @cuda_test + def test_non_contiguous_initial_states_handled_on_gpu(self): + # Some PyTorch operations which produce contiguous tensors on the CPU + # produce non-contiguous tensors on the GPU (e.g. forward pass of an + # RNN when batch_first=True). Accordingly, we perform the same checks + # from previous test on the GPU to ensure the encoder is not affected + # by which device it is on. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. + encoder_base = _EncoderBase(stateful=False).cuda() + initial_states = (torch.randn(5, 6, 7).cuda().permute(1, 0, 2), + torch.randn(5, 6, 7).cuda().permute(1, 0, 2),) + assert not initial_states[0].is_contiguous() and not initial_states[ + 1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward( + self.lstm.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states) + encoder_base.sort_and_run_forward( + self.rnn.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states[0]) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if its + # state was previously updated with non-contiguous tensors. As in the + # non-stateful tests, we check that the encoder still works on initial + # states for RNNs and LSTMs. 
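+        # (The non-contiguous CUDA tensors from Case 1 are reused here as if
+        # they were the final states returned by a previous forward pass.)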
+ final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True).cuda() + encoder_base._update_states(final_states, + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.lstm.cuda(), self.tensor.cuda(), + self.mask.cuda()) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.rnn.cuda(), self.tensor.cuda(), + self.mask.cuda()) + + +class TestHighway(unittest.TestCase): + + def test_forward_works_on_simple_input(self): + highway = Highway(2, 2) + + highway._layers[0].weight.data.fill_(1) + highway._layers[0].bias.data.fill_(0) + highway._layers[1].weight.data.fill_(2) + highway._layers[1].bias.data.fill_(-2) + input_tensor = torch.FloatTensor([[-2, 1], [3, -2]]) + result = highway(input_tensor).data.numpy() + assert result.shape == (2, 2) + # This was checked by hand. + assert_almost_equal(result, [[-0.0394, 0.0197], [1.7527, -0.5550]], + decimal=4) + + def test_forward_works_on_nd_input(self): + highway = Highway(2, 2) + input_tensor = torch.ones(2, 2, 2) + output = highway(input_tensor) + assert output.size() == (2, 2, 2) + + +class TestLstmCellWithProjection(unittest.TestCase): + + def test_elmo_lstm_cell_completes_forward_pass(self): + input_tensor = torch.rand(4, 5, 3) + input_tensor[1, 4:, :] = 0.0 + input_tensor[2, 2:, :] = 0.0 + input_tensor[3, 1:, :] = 0.0 + + initial_hidden_state = torch.ones([1, 4, 5]) + initial_memory_state = torch.ones([1, 4, 7]) + + lstm = LstmCellWithProjection( + input_size=3, + hidden_size=5, + cell_size=7, + memory_cell_clip_value=2, + state_projection_clip_value=1,) + output_sequence, lstm_state = lstm( + input_tensor, [5, 4, 2, 1], (initial_hidden_state, + initial_memory_state)) + numpy.testing.assert_array_equal( + output_sequence.data[1, 4:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[2, 2:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[3, 1:, :].numpy(), 0.0) + + # Test the state clipping. + numpy.testing.assert_array_less(output_sequence.data.numpy(), 1.0) + numpy.testing.assert_array_less(-output_sequence.data.numpy(), 1.0) + + # LSTM state should be (num_layers, batch_size, hidden_size) + assert list(lstm_state[0].size()) == [1, 4, 5] + # LSTM memory cell should be (num_layers, batch_size, cell_size) + assert list((lstm_state[1].size())) == [1, 4, 7] + + # Test the cell clipping. 
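+        # lstm_state[1] is the memory cell, which the cell clamps to
+        # memory_cell_clip_value=2 at each timestep.
+        numpy.testing.assert_array_less(lstm_state[1].data.numpy(), 2.0)
+        numpy.testing.assert_array_less(-lstm_state[1].data.numpy(), 2.0)
+        # The projected hidden state was clipped even more tightly (to
+        # state_projection_clip_value=1), so it also stays within these
+        # bounds.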
+ numpy.testing.assert_array_less(lstm_state[0].data.numpy(), 2.0) + numpy.testing.assert_array_less(-lstm_state[0].data.numpy(), 2.0) + + +class TestTimeDistributed(unittest.TestCase): + + def test_time_distributed_reshapes_named_arg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter( + torch.FloatTensor([[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]]) + + def test_time_distributed_reshapes_positional_kwarg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter(torch.FloatTensor( + [[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(input=char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]]) + + def test_time_distributed_works_with_multiple_inputs(self): + module = lambda x, y: x + y + distributed = TimeDistributed(module) + x_input = torch.LongTensor([[[1, 2], [3, 4]]]) + y_input = torch.LongTensor([[[4, 2], [9, 1]]]) + output = distributed(x_input, y_input) + assert_almost_equal(output.data.numpy(), [[[5, 4], [12, 5]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_tensor_correctly(self): + + class FakeModule(Module): + def forward(self, input_tensor, tensor_to_pass_through=None, + another_tensor=None): + return input_tensor + tensor_to_pass_through + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_to_pass_through = torch.LongTensor([3, 7]) + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + tensor_to_pass_through=input_to_pass_through, + another_tensor=input_tensor2, + pass_through=["tensor_to_pass_through"],) + assert_almost_equal(output.data.numpy(), [[[8, 11], [15, 12]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_non_tensor_correctly(self): + + class FakeModule(Module): + + def forward(self, input_tensor, number=0, another_tensor=None): + + return input_tensor + number + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_number = 5 + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + number=input_number, + another_tensor=input_tensor2, + pass_through=["number"],) + assert_almost_equal(output.data.numpy(), [[[10, 9], [17, 10]]]) + + +class TestUtils(unittest.TestCase): + + def test_add_sentence_boundary_token_ids_handles_2D_input(self): + tensor = torch.from_numpy(numpy.array([[1, 2, 3], [4, 5, 0]])) + mask = (tensor > 0).long() + bos = 9 + eos = 10 + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array([[9, 1, 2, 3, 10], [9, 4, 5, 10, 0]]) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == (expected_new_tensor > 0)).all() + + def test_add_sentence_boundary_token_ids_handles_3D_input(self): + tensor = torch.from_numpy( + numpy.array([[[1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2]], + [[4, 3, 2, 1], [8, 7, 6, 5], [0, 0, 0, 0]]])) + 
mask = ((tensor > 0).sum(dim=-1) > 0).type(torch.LongTensor) + bos = torch.from_numpy(numpy.array([9, 9, 9, 9])) + eos = torch.from_numpy(numpy.array([10, 10, 10, 10])) + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array( + [[[9, 9, 9, 9], [1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2], + [10, 10, 10, 10]], + [[9, 9, 9, 9], [4, 3, 2, 1], [8, 7, 6, 5], [10, 10, 10, 10], + [0, 0, 0, 0]]]) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == ( + (expected_new_tensor > 0).sum(axis=-1) > 0)).all() + + def test_remove_sentence_boundaries(self): + tensor = torch.from_numpy(numpy.random.rand(3, 5, 7)) + mask = torch.from_numpy( + # The mask with two elements is to test the corner case + # of an empty sequence, so here we are removing boundaries + # from " " + numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]) + ).long() + new_tensor, new_mask = remove_sentence_boundaries(tensor, mask) + + expected_new_tensor = torch.zeros(3, 3, 7) + expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :] + expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :] + assert_array_almost_equal(new_tensor.data.numpy(), + expected_new_tensor.data.numpy()) + + expected_new_mask = torch.from_numpy(numpy.array( + [[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long() + assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all() + + def test_block_orthogonal_can_initialize(self): + tensor = torch.zeros([10, 6]) + block_orthogonal(tensor, [5, 3]) + tensor = tensor.data.numpy() + + def test_block_is_orthogonal(block) -> None: + matrix_product = block.T @ block + numpy.testing.assert_array_almost_equal( + matrix_product, numpy.eye(matrix_product.shape[-1]), 6) + + test_block_is_orthogonal(tensor[:5, :3]) + test_block_is_orthogonal(tensor[:5, 3:]) + test_block_is_orthogonal(tensor[5:, 3:]) + test_block_is_orthogonal(tensor[5:, :3]) + + def test_block_orthogonal_raises_on_mismatching_dimensions(self): + tensor = torch.zeros([10, 6, 8]) + with self.assertRaises(ValueError): + block_orthogonal(tensor, [7, 2, 1]) + + +class TestScalarMix(unittest.TestCase): + + def test_scalar_mix_can_run_forward(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + for k in range(3): + mixture.scalar_parameters[k].data[0] = 0.1 * (k + 1) + mixture.gamma.data[0] = 0.5 + result = mixture(tensors) + + weights = [0.1, 0.2, 0.3] + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = sum(normed_weights[k] * tensors[k].data.numpy() + for k in range(3)) + expected_result *= 0.5 + numpy.testing.assert_almost_equal(expected_result, result.data.numpy()) + + def test_scalar_mix_throws_error_on_incorrect_number_of_inputs(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(5)] + with self.assertRaises(ValueError): + _ = mixture(tensors) + + def test_scalar_mix_throws_error_on_incorrect_initial_scalar_parameters_length(self): + with self.assertRaises(ValueError): + ScalarMix(3, initial_scalar_parameters=[0.0, 0.0]) + + def test_scalar_mix_trainable_with_initial_scalar_parameters(self): + initial_scalar_parameters = [1.0, 2.0, 3.0] + mixture = ScalarMix(3, + initial_scalar_parameters=initial_scalar_parameters, + trainable=False) + for i, scalar_mix_parameter in enumerate(mixture.scalar_parameters): + assert scalar_mix_parameter.requires_grad is False + assert scalar_mix_parameter.item() == initial_scalar_parameters[i] + + def 
test_scalar_mix_layer_norm(self): + mixture = ScalarMix(3, do_layer_norm="scalar_norm_reg") + + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + numpy_mask = numpy.ones((3, 4), dtype="int32") + numpy_mask[1, 2:] = 0 + mask = torch.from_numpy(numpy_mask) + + weights = [0.1, 0.2, 0.3] + for k in range(3): + mixture.scalar_parameters[k].data[0] = weights[k] + mixture.gamma.data[0] = 0.5 + result = mixture(tensors, mask) + + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = numpy.zeros((3, 4, 5)) + for k in range(3): + mean = numpy.mean(tensors[k].data.numpy()[numpy_mask == 1]) + std = numpy.std(tensors[k].data.numpy()[numpy_mask == 1]) + normed_tensor = (tensors[k].data.numpy() - mean) / (std + 1e-12) + expected_result += normed_tensor * normed_weights[k] + expected_result *= 0.5 + + numpy.testing.assert_almost_equal(expected_result, result.data.numpy(), + decimal=6) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/utils/test.py b/texar/torch/utils/test.py index 26bdfe10e..50a28eb27 100644 --- a/texar/torch/utils/test.py +++ b/texar/torch/utils/test.py @@ -21,6 +21,7 @@ __all__ = [ "pretrained_test", "data_test", + "cuda_test", "external_library_test", ] @@ -35,6 +36,8 @@ def define_skip_condition(flag: str, explanation: str): 'TEST_PRETRAINED', "Test requires loading pre-trained checkpoints.") data_test = define_skip_condition( 'TEST_DATA', "Test requires loading large data files.") +cuda_test = define_skip_condition( + 'TEST_CUDA', "Test requires cuda.") def external_library_test(name: str): diff --git a/texar/torch/utils/utils.py b/texar/torch/utils/utils.py index 426081587..bb71e76c8 100644 --- a/texar/torch/utils/utils.py +++ b/texar/torch/utils/utils.py @@ -19,10 +19,12 @@ import copy import inspect from functools import lru_cache +from itertools import islice from pydoc import locate from typing import ( - Any, Callable, Collection, Dict, List, MutableMapping, Optional, Sequence, - Tuple, Type, TypeVar, Union, cast, no_type_check, overload) + Any, Callable, Collection, Dict, Iterable, Iterator, List, MutableMapping, + Optional, Sequence, Tuple, Type, TypeVar, Union, cast, no_type_check, + overload) import funcsigs import numpy as np @@ -67,6 +69,11 @@ 'uniquify_str', 'ceildiv', 'sum_tensors', + 'lazy_groups_of', + 'sort_batch_by_length', + 'get_device_of', + 'combine_initial_dims', + 'uncombine_initial_dims', ] T = TypeVar('T') # type argument @@ -1196,3 +1203,103 @@ def truncate_seq_pair(tokens_a: Union[List[int], List[str]], tokens_a.pop() else: tokens_b.pop() + + +A = TypeVar("A") + + +def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: + r"""Takes an iterable and batches the individual instances into lists of the + specified size. The last list may be smaller if there are instances left + over. + + Args: + iterable: An iterable object. + group_size: The group size. + + Returns: + An iterator. + """ + iterator = iter(iterable) + while True: + s = list(islice(iterator, group_size)) + if len(s) > 0: + yield s + else: + break + + +def sort_batch_by_length(tensor: torch.Tensor, + sequence_lengths: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Sort a batch first tensor by some specified lengths. + + Args: + tensor: A batch first tensor. + sequence_lengths: A tensor representing the lengths of some dimension of + the tensor which we want to sort by. 
+ + Returns: + sorted_tensor: The original tensor sorted along the batch dimension + with respect to `sequence_lengths`. + sorted_sequence_lengths: The original `sequence_lengths` sorted by + decreasing size. + restoration_indices: Indices into the `sorted_tensor` such that + ``sorted_tensor.index_select(0, restoration_indices) == + original_tensor`` + permutation_index: The indices used to sort the tensor. This is useful + if you want to sort many tensors using the same ordering. + """ + if not isinstance(tensor, torch.Tensor) or \ + not isinstance(sequence_lengths, torch.Tensor): + raise ValueError( + "Both the tensor and sequence lengths must be torch.Tensors.") + + sorted_sequence_lengths, permutation_index = sequence_lengths.sort( + 0, descending=True) + sorted_tensor = tensor.index_select(0, permutation_index) + + index_range = torch.arange(0, len(sequence_lengths), + device=sequence_lengths.device) + # This is the equivalent of zipping with index, sorting by the original + # sequence lengths and returning the now sorted indices. + _, reverse_mapping = permutation_index.sort(0, descending=False) + restoration_indices = index_range.index_select(0, reverse_mapping) + return (sorted_tensor, sorted_sequence_lengths, restoration_indices, + permutation_index) + + +def get_device_of(tensor: torch.Tensor) -> int: + r"""Returns the device of the tensor. + """ + if not tensor.is_cuda: + return -1 + else: + return tensor.get_device() + + +def combine_initial_dims(tensor: torch.Tensor) -> torch.Tensor: + r"""Given a (possibly higher order) tensor with shape + `[d1, ..., dn, sequence_length]` Return a view that's + `[d1 * ... * dn, sequence_length]`. If original tensor is 1-d or 2-d, + return it as is. + """ + if tensor.dim() <= 2: + return tensor + else: + return tensor.view(-1, tensor.size(-1)) + + +def uncombine_initial_dims(tensor: torch.Tensor, + original_size: torch.Size) -> torch.Tensor: + r"""Given a tensor of embeddings with shape + `[d1 * ... * dn, sequence_length, embedding_dim]` and the original shape + `[d1, ..., dn, sequence_length]`, return the reshaped tensor of embeddings + with shape `[d1, ..., dn, sequence_length, embedding_dim]`. + If original size is 1-d or 2-d, return it as is. + """ + if len(original_size) <= 2: + return tensor + else: + view_args = list(original_size) + [tensor.size(-1)] + return tensor.view(*view_args) diff --git a/texar/torch/utils/utils_test.py b/texar/torch/utils/utils_test.py index 2eb543a77..190d95e62 100644 --- a/texar/torch/utils/utils_test.py +++ b/texar/torch/utils/utils_test.py @@ -195,25 +195,50 @@ def test_truncate_seq_pair(self): self.assertListEqual(tokens_a, [1]) self.assertListEqual(tokens_b, [2, 3]) - # def test_map_ids_to_strs(self): - # """Tests :func:`texar.torch.utils.map_ids_to_strs`. 
- # """ - # vocab_list = ['word', '词'] - # vocab_file = tempfile.NamedTemporaryFile() - # vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - # vocab_file.flush() - # vocab = Vocab(vocab_file.name) - - # text = [['', 'word', '词', '', ''], - # ['word', '词', 'word', '词', '']] - # text = np.asarray(text) - # ids = vocab.map_tokens_to_ids_py(text) - - # ids = ids.tolist() - # text_ = utils.map_ids_to_strs(ids, vocab) - - # self.assertEqual(text_[0], 'word 词') - # self.assertEqual(text_[1], 'word 词 word 词') + def test_lazy_groups_of(self): + xs = [1, 2, 3, 4, 5, 6, 7] + groups = utils.lazy_groups_of(iter(xs), group_size=3) + assert next(groups) == [1, 2, 3] + assert next(groups) == [4, 5, 6] + assert next(groups) == [7] + with self.assertRaises(StopIteration): + _ = next(groups) + + def test_sort_batch_by_length(self): + tensor = torch.rand([5, 7, 9]) + tensor[0, 3:, :] = 0 + tensor[1, 4:, :] = 0 + tensor[2, 1:, :] = 0 + tensor[3, 5:, :] = 0 + + sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) + sorted_tensor, sorted_lengths, reverse_indices, _ = \ + utils.sort_batch_by_length(tensor, sequence_lengths) + + # Test sorted indices are padded correctly. + np.testing.assert_array_equal(sorted_tensor[1, 5:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[2, 4:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[3, 3:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[4, 1:, :].data.numpy(), 0.0) + + assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) + + # Test restoration indices correctly recover the original tensor. + assert sorted_tensor.index_select(0, reverse_indices).data.equal( + tensor.data) + + def test_combine_initial_dims(self): + tensor = torch.randn(4, 10, 20, 17, 5) + + tensor2d = utils.combine_initial_dims(tensor) + assert list(tensor2d.size()) == [4 * 10 * 20 * 17, 5] + + def test_uncombine_initial_dims(self): + embedding2d = torch.randn(4 * 10 * 20 * 17 * 5, 12) + + embedding = utils.uncombine_initial_dims(embedding2d, + torch.Size((4, 10, 20, 17, 5))) + assert list(embedding.size()) == [4, 10, 20, 17, 5, 12] if __name__ == "__main__":