diff --git a/convokit/speakerConvoDiversity/speakerConvoDiversity2.py b/convokit/speakerConvoDiversity/speakerConvoDiversity2.py index 34a8054a..f0d8fd01 100644 --- a/convokit/speakerConvoDiversity/speakerConvoDiversity2.py +++ b/convokit/speakerConvoDiversity/speakerConvoDiversity2.py @@ -126,6 +126,7 @@ def transform(self, corpus): corpus, "speaker", target_text_func=lambda utt: self._get_utt_row(utt, input_table).tokens, + smooth=False ) self._set_output(corpus, input_table) return corpus @@ -163,8 +164,7 @@ def _init_surprise(self, model_key_selector): surprise_attr_name=self.surprise_attr_name, target_sample_size=target_sample_size, context_sample_size=context_sample_size, - n_samples=n_samples, - smooth=False, + n_samples=n_samples ) def _get_text_func(self, utt: Utterance, df: pd.DataFrame): diff --git a/convokit/surprise/__init__.py b/convokit/surprise/__init__.py index d6d19d0c..57568901 100644 --- a/convokit/surprise/__init__.py +++ b/convokit/surprise/__init__.py @@ -1 +1,15 @@ +import importlib.util +import sys + +from .convokit_lm import * +from .language_model import * from .surprise import * + +if "kenlm" in sys.modules: + from .kenlm import * +elif (spec := importlib.util.find_spec("kenlm")) is not None: + module = importlib.util.module_from_spec(spec) + sys.modules["kenlm"] = module + spec.loader.exec_module(module) + + from .kenlm import * diff --git a/convokit/surprise/convokit_lm.py b/convokit/surprise/convokit_lm.py new file mode 100644 index 00000000..1b7ee5e5 --- /dev/null +++ b/convokit/surprise/convokit_lm.py @@ -0,0 +1,86 @@ +from collections import Counter +from typing import Optional, Any, Union, List + +import numpy as np + +from .language_model import LanguageModel + + +class ConvoKitLanguageModel(LanguageModel): + """A simple language model to compute the deviation of target from context. + + This language model implements cross-entropy and perplexity language model evaluation functions, + to be used in evaluating the average deviation of target from the specified context. + + :param model_type: The name (identifier) of the :py:class:`~convokit.ConvoKitLanguageModel`, + defaults to "convokit_lm". Note that the `model_type` can be accessed using the `type` + property (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. This + language model currently uses the following keyword arguments: + + * `smooth`: Indicator of using Laplace smoothing in the computation of cross-entropy scores, + defaults to `True`. + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. + + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). + """ + + def __init__(self, model_type: str = "convokit_lm", **kwargs: Optional[Any]): + super().__init__(model_type, **kwargs) + + self._smooth = kwargs["smooth"] if "smooth" in kwargs else True + + def cross_entropy( + self, + target: Union[List[str], np.ndarray], + context: Union[List[str], np.ndarray], + ) -> float: + r"""Implements the base class method to compute the cross-entropy. + + Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. Note that we use the + natural logarithm; however, any base and corresponding exponent can be employed. For + instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). 
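As a quick numeric illustration of the smoothed estimate described in this docstring (a standalone sketch that simply mirrors the formula; the token lists are made up and the helper below is not part of this diff):

```python
import numpy as np
from collections import Counter

def smoothed_cross_entropy(target, context):
    # +1 (Laplace) smoothing: the effective vocabulary is the number of distinct
    # context tokens plus one slot reserved for out-of-vocabulary tokens.
    counts = Counter(context)
    vocab_size = len(counts) + 1
    return sum(
        -np.log((counts.get(token, 0) + 1) / (len(context) + vocab_size))
        for token in target
    ) / len(target)

# "b" never occurs in the context, but smoothing keeps Q("b") > 0.
print(smoothed_cross_entropy(["a", "b"], ["a", "a", "c"]))  # ~1.24 nats
```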
+ + The smoothing boolean argument, `smooth`, is accessed from the setting in the language model + constructor (defaults to `True` when unspecified). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :return: The cross-entropy score computed as :math:`H(P, Q)`. + """ + n_target, n_context = len(target), len(context) + if min(n_target, n_context) == 0: + return np.nan + + context_counts = Counter(context) + smooth_v = len(context_counts) + 1 if self._smooth else 0 + smooth_k = 1 if self._smooth else 0 + value = 0 if self._smooth else 1 + + return ( + sum( + -np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) + for token in target + ) + / n_target + ) + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + r"""Implements the base class method to compute perplexity. + + Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. Note that + we use the natural logarithm; however, any base and corresponding exponent can be employed. + For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). + + For convenience, the perplexity score is computed as the exponentiation of the cross-entropy + calculated using the `cross_entropy()` method. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :return: The perplexity score computed as :math:`\text{PPL}(P, Q)`. + """ + return np.exp(self.cross_entropy(target, context)) diff --git a/convokit/surprise/demos/surprise_demo.ipynb b/convokit/surprise/demos/surprise_demo.ipynb index 92946d6c..036fcdb5 100644 --- a/convokit/surprise/demos/surprise_demo.ipynb +++ b/convokit/surprise/demos/surprise_demo.ipynb @@ -11,17 +11,23 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "import convokit\n", "import itertools\n", + "\n", "import numpy as np\n", "import spacy\n", - "from convokit import Corpus, download, Surprise\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "import convokit\n", + "from convokit import Corpus, download\n", + "from convokit import Surprise, ConvoKitLanguageModel, Kenlm\n", "from convokit.text_processing import TextProcessor, TextParser\n", - "from sklearn.feature_extraction.text import CountVectorizer" + "\n", + "from tqdm.notebook import tqdm\n", + "import pprint as pp" ] }, { @@ -35,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "tags": [] }, @@ -44,7 +50,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dataset already exists at /home/axl4/.convokit/downloads/subreddit-Cornell\n" + "Dataset already exists at /Users/tushaar/.convokit/downloads/subreddit-Cornell\n" ] } ], @@ -54,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "tags": [] }, @@ -82,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -93,27 +99,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/axl4/ConvoKit/convokit/model/corpus.py:1213: FutureWarning: set_info() is deprecated and will be removed in a future release. 
Use add_meta() instead.\n", - "/home/axl4/ConvoKit/convokit/model/corpus.py:1219: FutureWarning: set_info() is deprecated and will be removed in a future release. Use add_meta() instead.\n" - ] - } - ], + "outputs": [], "source": [ "corpus.organize_speaker_convo_history(utterance_filter=utterance_is_valid)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -122,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -213,7 +210,7 @@ "Fencerman2 298.0" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -233,19 +230,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", - "subset_utts = [list(corpus.get_speaker(speaker).iter_utterances(selector=utterance_is_valid)) for speaker in top_speakers]\n", + "subset_utts = [list(corpus.get_speaker(speaker).iter_utterances(selector=utterance_is_valid)) \n", + " for speaker in top_speakers]\n", "subset_corpus = Corpus(utterances=list(itertools.chain(*subset_utts)))" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "tags": [] }, @@ -276,47 +274,43 @@ "\n", "The transformer also has an optional `tokenizer` parameter to customize tokenization. Here we will tokenize the text outside of the surprise transformer, so our tokenizer will be an identity function.\n", "\n", - "The `smooth` parameter determines whether the transformer uses +1 laplace smoothing (`smooth = True`) or naively replaces 0 counts with 1's as the SpeakerConvoDiversity transformer does (`smooth = False`)." 
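With this change, smoothing is no longer a `Surprise` constructor argument: it is configured on the language model (e.g., `ConvoKitLanguageModel(smooth=False)`) or passed through at transform time, as the `speakerConvoDiversity2.py` hunk above does. A minimal sketch reusing this notebook's objects:

```python
# Configure smoothing on the language model rather than on the transformer.
convokit_lm = ConvoKitLanguageModel(smooth=False)
transformed_corpus = surp.transform(subset_corpus, obj_type='speaker',
                                    language_model=convokit_lm,
                                    eval_type='cross_entropy')
```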
+ "" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "\n", "spacy_nlp = spacy.load('en_core_web_sm', disable=['ner','parser', 'tagger', 'lemmatizer'])\n", - "for utt in subset_corpus.iter_utterances():\n", + "for utt in tqdm(subset_corpus.iter_utterances()):\n", " utt.meta['joined_tokens'] = [t.text.lower() for t in spacy_nlp(utt.text)]" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "surp = Surprise(tokenizer=lambda x: x, model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), target_sample_size=100, context_sample_size=1000, n_samples=50, smooth=True)" + "surp = Surprise(tokenizer=lambda x: x, \n", + " model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), \n", + " target_sample_size=100, context_sample_size=1000, n_samples=50, n_jobs=8)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "fit1: 20550it [00:16, 1283.44it/s]\n", - "fit2: 100%|██████████| 15394/15394 [00:00<00:00, 1032033.56it/s]\n" - ] - } - ], + "outputs": [], "source": [ - "surp = surp.fit(subset_corpus, text_func=lambda utt: [list(itertools.chain(*[u.meta['joined_tokens'] for u in utt.speaker.iter_utterances() if u.conversation_id != utt.conversation_id]))])" + "surp = surp.fit(subset_corpus, \n", + " text_func=lambda utt: [list(itertools.chain(*[u.meta['joined_tokens'] \n", + " for u in utt.speaker.iter_utterances() \n", + " if u.conversation_id != utt.conversation_id]))])" ] }, { @@ -330,21 +324,61 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "tags": [] - }, + "execution_count": 19, + "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "transform: 100it [15:57, 9.57s/it]\n" + "{'model_type': 'convokit_lm', 'n_jobs': 8, 'smooth': True}\n" ] } ], "source": [ - "transformed_corpus = surp.transform(subset_corpus, obj_type='speaker')" + "convokit_lm = ConvoKitLanguageModel(n_jobs=8)\n", + "pp.pprint(convokit_lm.config)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'is_persistent': False,\n", + " 'kenlm_path': '/Users/tushaar/kenlm',\n", + " 'model_filename': 'kenlm_surprise',\n", + " 'model_type': 'kenlm',\n", + " 'models_dir': '/Users/tushaar/Desktop/kenlm_models',\n", + " 'n_jobs': 8,\n", + " 'ngram_order': 2}\n" + ] + } + ], + "source": [ + "# Replace with appropriate paths to your kenlm directory\n", + "# and the folder to save the models.\n", + "kenlm = Kenlm(kenlm_path='/Users/tushaar/kenlm', \n", + " models_dir='/Users/tushaar/Desktop/kenlm_models', \n", + " model_filename='kenlm_surprise', \n", + " n_jobs=8)\n", + "pp.pprint(kenlm.config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "transformed_corpus = surp.transform(subset_corpus, obj_type='speaker', \n", + " language_model=convokit_lm, eval_type='cross_entropy')" ] }, { @@ -357,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -386,26 +420,26 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ - "EQUASHNZRKUL_815y6t 7.233156\n", - "SwissWatchesOnly_8g5q88 7.216094\n", - "SwissWatchesOnly_67cljd 7.129933\n", - "EQUASHNZRKUL_73xuw6 7.114335\n", - "Straight_Derpin_5kst5l 7.067594\n", - "laveritecestla_6v4ysm 7.066840\n", - "ClawofBeta_52u1nu 7.059744\n", - "Udontlikecake_7rj6a0 7.053087\n", - "syntheticity_97zg9z 7.041747\n", - "DEEP_THORAX_8drwet 7.038059\n", + "EQUASHNZRKUL_815y6t 7.258089\n", + "SwissWatchesOnly_8g5q88 7.199374\n", + "SwissWatchesOnly_67cljd 7.125212\n", + "EQUASHNZRKUL_73xuw6 7.100619\n", + "Udontlikecake_7rj6a0 7.083575\n", + "ClawofBeta_52u1nu 7.081842\n", + "Straight_Derpin_5kst5l 7.080008\n", + "syntheticity_97zg9z 7.055642\n", + "CornellMan333_9iwucv 7.043682\n", + "t3hasiangod_42k6wa 7.040483\n", "dtype: float64" ] }, - "execution_count": 16, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -424,26 +458,26 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Unga_Bunga_30ac0l 5.841967\n", - "Bisphosphate_7r8nu1 5.941750\n", - "crash_over-ride_6bjxnm 5.945221\n", - "crash_over-ride_8f7b0y 5.962945\n", - "crash_over-ride_7owfvv 5.963205\n", - "crash_over-ride_30zba1 5.970271\n", - "crash_over-ride_2vhtzx 5.970866\n", - "crash_over-ride_t6w01 5.981621\n", - "omgdonerkebab_v4a3p 5.981898\n", - "crash_over-ride_9b132c 5.983570\n", + "Unga_Bunga_30ac0l 5.849274\n", + "crash_over-ride_30zba1 5.937072\n", + "omgdonerkebab_v4a3p 5.944469\n", + "Bisphosphate_7r8nu1 5.960513\n", + "crash_over-ride_t6w01 5.962633\n", + "crash_over-ride_6bjxnm 5.967824\n", + "crash_over-ride_v4j70 5.980576\n", + "crash_over-ride_2vhtzx 5.982879\n", + "crash_over-ride_8f7b0y 5.990480\n", + "crash_over-ride_9b132c 6.002238\n", "dtype: float64" ] }, - "execution_count": 17, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -470,7 +504,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -484,7 +518,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/convokit/surprise/demos/tennis_demo.ipynb b/convokit/surprise/demos/tennis_demo.ipynb index a5012807..f352404c 100644 --- a/convokit/surprise/demos/tennis_demo.ipynb +++ b/convokit/surprise/demos/tennis_demo.ipynb @@ -10,16 +10,22 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "import convokit\n", "import json\n", + "\n", "import numpy as np\n", "from collections import defaultdict\n", - "from convokit import Corpus, Speaker, Utterance, download, Surprise\n", - "from tqdm import tqdm" + "from nltk.tokenize import word_tokenize, sent_tokenize\n", + "\n", + "import convokit\n", + "from convokit import Surprise, ConvoKitLanguageModel, Kenlm\n", + "from convokit import Corpus, Speaker, Utterance, download\n", + "\n", + "from tqdm.notebook import tqdm\n", + "import pprint as pp" ] }, { @@ -32,17 +38,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "PATH = '/home/axl4' # replace with your path to tennis_data directory\n", + "PATH = '../../../../examples' # replace with your path to tennis_data directory\n", "data_dir = f'{PATH}/tennis_data/'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, 
"outputs": [], "source": [ @@ -51,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -61,15 +67,22 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 3962/3962 [00:00<00:00, 267184.91it/s]\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7ef5f528cab74142aca4e45705e3e631", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3962 [00:00" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "surp.fit(game_commentary_corpus, text_func=lambda utt: [' '.join([u.text for u in game_commentary_corpus.iter_utterances()])])" + "surp = surp.fit(game_commentary_corpus, \n", + " text_func=lambda utt: [' '.join([u.text for u in game_commentary_corpus.iter_utterances()])])" ] }, { @@ -219,16 +223,19 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", "SAMPLE = True\n", - "SAMPLE_SIZE = 10000 # edit this to change the number of interview questions to calculate surprise for\n", + "SAMPLE_SIZE = 500 # edit this to change the number of interview questions to calculate surprise for\n", "\n", - "subset_utts = [interview_corpus.get_utterance(utt) for utt in interview_corpus.get_utterances_dataframe(selector=lambda utt: utt.meta['is_question']).sample(SAMPLE_SIZE).index]\n", + "subset_utts = \\\n", + " [interview_corpus.get_utterance(utt)\n", + " for utt in interview_corpus.get_utterances_dataframe(selector=lambda utt: \n", + " utt.meta['is_question']).sample(SAMPLE_SIZE).index]\n", "subset_corpus = Corpus(utterances=subset_utts) if SAMPLE else interview_corpus" ] }, @@ -241,29 +248,41 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "transform: 10000it [31:05, 5.36it/s]\n" + "{'is_persistent': True,\n", + " 'kenlm_path': '/Users/tushaar/kenlm',\n", + " 'model_filename': 'kenlm_surprise',\n", + " 'model_type': 'kenlm',\n", + " 'models_dir': '../../../../examples/kenlm_models',\n", + " 'n_jobs': 1,\n", + " 'ngram_order': 2}\n" ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "surp.transform(subset_corpus, obj_type='utterance', selector=lambda utt: utt.meta['is_question'])" + "# Replace with appropriate paths to your kenlm directory.\n", + "kenlm = Kenlm(kenlm_path='/Users/tushaar/kenlm', \n", + " models_dir=f'{PATH}/kenlm_models', \n", + " model_filename='kenlm_surprise', \n", + " is_persistent=True)\n", + "pp.pprint(kenlm.config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subset_corpus = surp.transform(subset_corpus, obj_type='utterance',\n", + " selector=lambda utt: utt.meta['is_question'], \n", + " language_model=kenlm, eval_type='cross_entropy')" ] }, { @@ -276,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -285,45 +304,46 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7.1372781396723255" + 
"36.832740783691406" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", + "get_scores = lambda utterances: pd.Series([score['corpus']for score in utterances], index=utterances.index)\n", "\n", - "female_qs = pd.to_numeric(utterances[utterances['meta.player_gender'] == 'F']['meta.surprise']).dropna()\n", + "female_qs = get_scores(utterances[utterances['meta.player_gender'] == 'F']['meta.surprise']).dropna()\n", "female_qs.median()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7.147981123495766" + "37.093317667643234" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "male_qs = pd.to_numeric(utterances[utterances['meta.player_gender'] == 'M']['meta.surprise']).dropna()\n", + "male_qs = get_scores(utterances[utterances['meta.player_gender'] == 'M']['meta.surprise']).dropna()\n", "male_qs.median()" ] }, @@ -343,7 +363,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -353,18 +373,18 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "And when was that in the match? The first set? Second set?\n", - "When she broke you in the eighth game of the third set, she did a backhand off the net and it kind of clipped the net and you kind of netted the next one. Was that just a tough break?\n", - "You started 3Love down in the first set. You came back and won it 64. What was the turnaround for you in the opening set and on through the match?\n", - "Would you give her a good chance against Stosur in the next round?\n", - "Do you enjoy the balance of the life as a tour player and then back home in and the ability to serve your country in the military?\n" + "What was it like on court? When did you sense that she was vulnerable, beatable?\n", + "Did you advise Serena not to play there?\n", + "Were you aware of Richard and Venus coming in for the third set? You seemed to look to them after a couple points and showed real emotion.\n", + "Can you describe how disappointed you are right now.\n", + "In the beginning, you were almost down 4Love. Why the slow start?\n" ] } ], @@ -375,18 +395,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "And the second serve on the set point in the fourth set, just another day at the office?\n", - "Was it a big advantage to serve first in the third set?\n", - "But at the start of the third set again you had a little bit of a...\n", - "Speaking of the mental game, much is made of being the hunter or the hunted. For so long you were the hunted. This is the first week in a long time being the hunter. Is there a change at all in you?\n", - "How big of a deal was it get that break in the first game of the second set?\n" + "Why? What happened? What went wrong?\n", + "Is that your best result playing on grass? You had a good Wimbledon a couple years ago.\n", + "You seemed to have a good rhythm before the second rain break. What was the effect of the roof coming across? Did it feel different? Do you think it benefited him?\n", + "How is it different to play Roger on grass? You have played him on every surface. 
How is it different to play him here?\n", + "Why was he giving you so much trouble to start with? Was the problem with you or him or both?\n" ] } ], @@ -397,18 +417,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "No yoga, you prefer to dance? Some players do yoga.\n", - "What aspects of the match do you think were decisive, technically speaking?\n", - "Did you hear the birds? They were really crying. They were trapped and --\n", - "Did Sasha get an invitation to Kris Humphries' wedding this weekend?\n", - "Are you primarily based in Southern California or South Florida now?\n" + "The WTA has announces that next year your dad could come out with a tablet that shows stats. Do you think that your father would like having that access, and could you imagine what that would be like?\n", + "She's on an incredible run. Can you assess her as an upandcoming player? Is she somebody who can be an elite player in this game?\n", + "You've played Grand Slams before against players who were in their own country and been okay. Why the nerves? You've been in great form, haven't lost in a while.\n", + "When you look around the locker room you must feel like the senior citizen.\n", + "I forgot that you mentioned you're having your 27th birthday this fall.\n" ] } ], @@ -419,18 +439,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Are you planning to play tactically against James or Mathieu tomorrow?\n", - "Did you consider yourself a streaky player even in college?\n", - "You said you watched Scream last night to relax. Do you normally watch horror films to relax?\n", - "How do you view your secondround matchup with Bernard Tomic?\n", - "Just talk us through the messages on your kit bag.\n" + "Your friend Tiger Woods was here in Doral last weekend. Did you get a chance to cross paths with him or talk to him prior to this tournament?\n", + "You've started the year at the Australian Open the last couple years and done very well. Do you feel like this is a Major tournament to you that will make a big difference in your career here this week?\n", + "You'll obviously go home with happy memories, despite what's happened today.\n", + "Have you changed anything in your clay court preparations from last year?\n", + "First match out here morning after a Davis Cup stint and traveling, how did you feel? 
Talk about the match.\n" ] } ], @@ -450,39 +470,52 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "gender_models_surp = Surprise(model_key_selector=lambda utt: utt.meta['player_gender'], target_sample_size=10, context_sample_size=5000, surprise_attr_name='surprise_gender_model')" + "gender_models_surp = Surprise(model_key_selector=lambda utt: utt.meta['player_gender'],\n", + " target_sample_size=10, context_sample_size=5000,\n", + " surprise_attr_name='surprise_gender_model')" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "fit1: 81974it [00:00, 302952.81it/s]\n", - "fit2: 100%|██████████| 2/2 [00:12<00:00, 6.31s/it]\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3024b19405004768ba07bcd599b33e28", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "fit: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "489621b89e8b4e6e9766b690bd24f825", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "" + "fit: 0%| | 0/2 [00:00" + "surprise: 100%|##########| 1/1 [00:05<00:00, 5.21s/it]" ] }, - "execution_count": 24, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.20s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.02s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.13s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.16s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.52s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.04s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.14s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "gender_models_surp.transform(subset_corpus, obj_type='utterance', group_and_models=lambda utt: (utt.id, ['M', 'F']), group_model_attr_key=lambda _, m: m, 
selector=lambda utt: utt.meta['is_question'])" + "subset_corpus = \\\n", + " gender_models_surp.transform(subset_corpus, obj_type='utterance', \n", + " group_and_models=lambda utt: (utt.id, ['M', 'F']), \n", + " group_model_attr_key=lambda _, m: m,\n", + " selector=lambda utt: utt.meta['is_question'], \n", + " language_model=convokit_lm, eval_type='cross_entropy')" ] }, { @@ -531,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -540,82 +705,86 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.78670861966856" + "5.804742348431906" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'F']['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'F'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.7477053372750335" + "5.762531083154594" ] }, - "execution_count": 27, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'F']['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'F'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.784562889828235" + "5.7774629531902235" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'M']['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'M'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.81045743833415" + "5.820980867869622" ] }, - "execution_count": 29, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'M']['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'M'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" ] }, { @@ -628,7 +797,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -642,7 +811,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/convokit/surprise/kenlm.py b/convokit/surprise/kenlm.py new file mode 100644 index 00000000..23e71cfc --- /dev/null +++ b/convokit/surprise/kenlm.py @@ -0,0 +1,246 @@ +import os +import subprocess +import time +import warnings +from pathlib import Path +from typing import Optional, Any, Union, List, Tuple + +import numpy as np + +from convokit.util import create_temp_files, delete_files +from .language_model import LanguageModel 
+ +try: + import kenlm +except (ModuleNotFoundError, ImportError): + raise ModuleNotFoundError( + "kenlm is not currently installed; run `pip install convokit[kenlm]` if you " + "would like to use the Kenlm language model. If kenlm installation fails, please " + "follow: https://github.com/kpu/kenlm/issues/57 to install kenlm." + ) + + +class Kenlm(LanguageModel): + """A language model to compute the deviation of target from context using KenLM. + + Using KenLM library, this language model implements cross-entropy and perplexity language model + evaluation functions, to be used in evaluating the average deviation of target text from the + specified context. + + Run `pip install convokit[kenlm]` to install the KenLM library before using this language model + class. If kenlm installation fails, please follow: https://github.com/kpu/kenlm/issues/57 to + install the KenLM library. + + :param model_type: The name of the :py:class:`~convokit.Kenlm`, defaults to "kenlm". Note that + the `model_type` can be accessed using the `type` property (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. This + language model currently uses the following keyword arguments: + + * `ngram_order`: The order of n-gram language model, when the specified `ngram_order` is + less than 2 (or unspecified), the `ngram_order` is set to 2, since the KenLM library does + not support n-gram order below 2 (see: https://github.com/kpu/kenlm/issues/171). + * `trained_model_filepath`: The filepath to a pre-trained language model that is to be + persistently used. + * `is_persistent`: Indicator of model persistence, i.e., the model generated in the first + pass or that loaded from `trained_model_filepath` is used in all evaluations. When the + `trained_model_filepath` is specified, persistence is implied. Defaults to `False`. + * `kenlm_path`: The path to the KenLM library, defaults to the user's home directory. + * `models_dir`: The folder path to store the (trained) binary KenLM models, defaults to + `None`, indicating that the trained KenLM models need not be stored. + * `model_filename`: The filename used in storing model artefacts, defaults to `model_type`. + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. + + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). + """ + + def __init__(self, model_type: str = "kenlm", **kwargs: Optional[Any]): + super().__init__(model_type, **kwargs) + + self._ngram_order = kwargs["ngram_order"] if "ngram_order" in kwargs else 2 + if self._ngram_order < 2: + warnings.warn( + f"kenlm does not support n-gram order below 2; setting n-gram order to 2. " + f"See: https://github.com/kpu/kenlm/issues/171 for specifics." 
+ ) + self._ngram_order = 2 + + self._is_persistent = kwargs["is_persistent"] if "is_persistent" in kwargs else False + if self._is_persistent or "trained_model_filepath" in kwargs: + self._is_persistent = True + self.__kenlm_model = ( + Kenlm.load_kenlm_from_file(kwargs["trained_model_filepath"]) + if "trained_model_filepath" in kwargs + else None + ) + + if "kenlm_path" not in kwargs: + self._kenlm_path = os.path.join(str(Path.home()), "kenlm") + warnings.warn(f"the kenlm_path is unspecified, setting it to {self._kenlm_path}") + self.__kenlm_bin_path = os.path.join(self._kenlm_path, "build/bin") + if not os.path.isdir(self.__kenlm_bin_path): + raise FileNotFoundError( + f"the build directory for kenlm does not exist at: {self.__kenlm_bin_path}; " + f"build kenlm {self._kenlm_path} before computing surprise scores" + ) + + self._models_dir = kwargs["models_dir"] if "models_dir" in kwargs else None + if self._models_dir and not os.path.exists(self._models_dir): + warnings.warn(f"creating the folder: {self._models_dir} as it does not exist") + os.makedirs(self._models_dir) + self._model_filename = ( + kwargs["model_filename"] if "model_filename" in kwargs else self._model_type + ) + + @staticmethod + def load_kenlm_from_file(trained_model_filepath: str) -> kenlm.Model: + """Loads the pre-trained KenLM model from the specified filepath. + + :param trained_model_filepath: The path to the pre-trained KenLM model. + :return: The loaded KenLM model. + """ + kenlm_model = kenlm.Model(trained_model_filepath) + return kenlm_model + + def __make_files(self) -> Tuple[str, str, str]: + """Create (if needed) and return the filenames of intermittent files. + + KenLM language model needs the training data filename, .arpa filename, and the binary model + filename to generate a KenLM model. If the models are not stored (specified through the + argument `models_dir` in the constructor), `tempfile` files are used, else, all the files + are generated in the `models_dir/current_timestamp` folder, using the filename specified in + the constructor. + + :return: A tuple of filenames of all the intermittent files needed. + """ + if self._models_dir: + epoch = str(int(time.time())) + os.makedirs(os.path.join(self._models_dir, epoch)) + + train_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.txt") + arpa_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.arpa") + model_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.bin") + else: + train_file, arpa_file, model_file = create_temp_files(num_files=3) + train_filename, arpa_filename, model_filename = ( + train_file.name, + arpa_file.name, + model_file.name, + ) + return train_filename, arpa_filename, model_filename + + @staticmethod + def __populate_train_file(filepath: str, samples: Union[List[List[str]], np.ndarray]): + """Writes the specified samples to a file, to be used in KenLM training. + + :param filepath: The filepath to write the samples to. + :param samples: The samples that are to be written to the file. Each list of samples is + delimited using a newline (`\n`). + """ + with open(filepath, "w", encoding="utf-8") as f: + for sample in samples: + f.write(f'{" ".join(sample).strip()}\n') + + def _get_kenlm_model(self, context_samples: Union[List[List[str]], np.ndarray]) -> kenlm.Model: + """Retrieve the KenLM model trained using the specified `context_samples`. 
+ + This method generates the training file using the `context_samples`, which is then used in + the generation of the .arpa and a binary KenLM trained model files. These intermittent files + are deleted, unless the specified value of `models_dir` is not `None`, indicating that the + models are to be stored. + + :param context_samples: The context samples to be used in training the KenLM model. + :return: The KenLM model trained on the specified `context_samples`. + """ + train_filename, arpa_filename, model_filename = self.__make_files() + + self.__populate_train_file(train_filename, samples=context_samples) + kenlm_args = [ + os.path.join(self.__kenlm_bin_path, "lmplz"), + "-o", + f"{self._ngram_order}", + "--text", + train_filename, + "--arpa", + arpa_filename, + "--discount_fallback", + ] + cmd_return = subprocess.run( + kenlm_args, + capture_output=False, + text=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + if cmd_return.returncode != 0: + delete_files([model_filename, arpa_filename, train_filename]) + raise RuntimeError("the kenlm model training was unsuccessful") + + kenlm_args = [ + os.path.join(self.__kenlm_bin_path, "build_binary"), + "trie", + arpa_filename, + model_filename, + ] + cmd_return = subprocess.run( + kenlm_args, + capture_output=False, + text=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + if cmd_return.returncode != 0: + delete_files([model_filename, arpa_filename, train_filename]) + raise RuntimeError("the kenlm model (binary) building was unsuccessful") + + kenlm_model = kenlm.Model(model_filename) + if not self._models_dir: + delete_files([model_filename, arpa_filename, train_filename]) + + return kenlm_model + + def cross_entropy( + self, + target: Union[List[str], np.ndarray], + context: Union[List[str], np.ndarray], + ) -> float: + """Implements the base class method to compute the cross-entropy. + + A KenLM model is trained using the specified `context`, and is used to evaluate the `target` + text. Note that, if model persistence is indicated in the constructor (using the argument + `is_persistent`), the model generated in the first pass or that loaded from the parameter + value of `trained_model_filepath` is used in all evaluations. (When `trained_model_filepath` + is specified, persistence is automatically implied.) + + The KenLM library returns a score of log-probabilities (when `score()` method is used), and + the cross-entropy is the negative log-likelihood. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q), used to train the model. + :return: The cross-entropy score computed using the `kenlm.score()` method. + """ + if self.__kenlm_model is None or not self._is_persistent: + self.__kenlm_model = self._get_kenlm_model([context]) + return -self.__kenlm_model.score(" ".join(target).strip()) + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + """Implements the base class method to compute perplexity. + + A KenLM model is trained using the specified `context`, and is used to evaluate the `target` + text. Note that, if model persistence is indicated in the constructor (using the argument + `is_persistent`), the model generated in the first pass or that loaded from the parameter + value of `trained_model_filepath` is used in all evaluations. (When `trained_model_filepath` + is specified, persistence is automatically implied.) 
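A minimal usage sketch of the persistence behavior described here (the `kenlm_path` below is hypothetical, and the sketch assumes the KenLM library has been built as described above):

```python
from convokit import Kenlm

# With is_persistent=True, the model trained on the first context is reused for
# every subsequent evaluation instead of retraining a model on each call.
kenlm_lm = Kenlm(kenlm_path='/home/user/kenlm', is_persistent=True, ngram_order=2)

context = "the quick brown fox jumps over the lazy dog".split()
target = "the quick red fox".split()
first = kenlm_lm.cross_entropy(target, context)      # trains a model from `context`
second = kenlm_lm.cross_entropy(target, ["unused"])  # reuses the persistent model; this context is ignored
```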
+ + The KenLM library returns a perplexity score, with the use of `kenlm.perplexity()` method. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q), used to train the model. + :return: The perplexity score computed using the `kenlm.perplexity()` method. + """ + if self.__kenlm_model is None or not self._is_persistent: + self.__kenlm_model = self._get_kenlm_model([context]) + return self.__kenlm_model.perplexity(" ".join(target).strip()) diff --git a/convokit/surprise/language_model.py b/convokit/surprise/language_model.py new file mode 100644 index 00000000..d931cce4 --- /dev/null +++ b/convokit/surprise/language_model.py @@ -0,0 +1,167 @@ +from abc import ABC +from typing import Optional, Any, List, Dict, Union, final + +import numpy as np +from joblib import Parallel, delayed + + +class LanguageModel(ABC): + """The abstract base class for all language models. + + The language model base class defines the :py:meth:`~convokit.LanguageModel.evaluate` method, + which performs language model evaluation using the `eval_type` specified as an argument to the + :py:meth:`~convokit.LanguageModel.evaluate` method. Note that this method must be defined and + implemented in the subclass (e.g., if the `eval_type` is set to "cross_entropy", the subclass + must implement :py:meth:`~convokit.LanguageModel.cross_entropy` method). The implemented method + should take in a list of target tokens and a list of context tokens, and output the language + model evaluation score. + + Since most language models employs cross-entropy and perplexity evaluations, this base class + includes unimplemented designs of :py:meth:`~convokit.LanguageModel.cross_entropy` and + :py:meth:`~convokit.LanguageModel.perplexity` functions, which may be implemented (as needed) in + the subclasses. See the subclass implementations: :py:class:`~convokit.ConvoKitLanguageModel` + and :py:class:`~convokit.Kenlm` classes, which extend this base class. + + The :py:meth:`~convokit.LanguageModel.evaluate` method defined in this class is called on a set + of context samples and a set of target samples, and evaluates the target-context distribution + deviations using the `eval_type` language model evaluation function. + + Note: The subclasses cannot override the :py:meth:`~convokit.LanguageModel.evaluate` method. + + :param model_type: The name (identifier) of :py:class:`~convokit.LanguageModel`, defaults to + "language_model". Note that the `model_type` can be accessed using the `type` property + (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. For + instance, the cross-entropy computes might require smoothing parameter; hence, a `smooth` + parameter can be passed as an additional keyword argument. + Another keyword argument is `n_jobs`, used to specify the number of concurrent threads to be + used for routines that are parallelized with `joblib`, defaults to 1. + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). + """ + + def __init__(self, model_type: str = "language_model", **kwargs: Optional[Any]): + self._model_type = model_type + self._n_jobs = kwargs["n_jobs"] if "n_jobs" in kwargs else 1 + + self.__dict__.update((f"_{arg}", value) for arg, value in kwargs.items()) + + @property + def type(self) -> str: + """The `model_type` property of the language model. 
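To make the extension contract described in the class docstring concrete, a toy subclass might look as follows (a hypothetical class, not part of this diff; it assumes `LanguageModel` is importable from `convokit.surprise.language_model`):

```python
from convokit.surprise.language_model import LanguageModel

class UnigramOverlapLM(LanguageModel):
    """Toy model: scores a target by the fraction of its tokens unseen in the context."""

    def __init__(self, **kwargs):
        super().__init__(model_type="unigram_overlap_lm", **kwargs)

    def cross_entropy(self, target, context):
        context_vocab = set(context)
        return sum(token not in context_vocab for token in target) / len(target)

lm = UnigramOverlapLM(n_jobs=1)
# evaluate() dispatches to cross_entropy() because eval_type="cross_entropy".
print(lm.evaluate([["a", "b"]], [["a", "c"]], eval_type="cross_entropy"))  # 0.5
```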
+ + :return: The `model_type` specified in the class constructor, defaults to "language_model". + """ + return self._model_type + + @property + def config(self) -> Dict[str, Any]: + """The configuration (all the class parameters) of the language model. + + :return: The configuration (all the class parameters specified in the class constructor and + elsewhere) of the language model. + """ + private_var_prefix = f"_{self.__class__.__name__}" + return { + arg[1:]: value + for arg, value in self.__dict__.items() + if not arg.startswith(private_var_prefix) + } + + def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): + """Overwrites the class variables with the values specified in `kwargs`. + + :param args_to_overwrite: The list of arguments (class variable names) whose values are to + be overwritten using the values in the `kwargs`. + :param kwargs: The keyword arguments with updates to the values of the class variables. + """ + for arg in args_to_overwrite: + self.__dict__[f"_{arg}"] = kwargs[arg] if arg in kwargs else self.__dict__[f"_{arg}"] + + def cross_entropy( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + r"""An unimplemented base class method to compute the cross-entropy. + + The cross-entropy between a list of target tokens and a list of context tokens is to be + computed by the implementation in the subclass. Note that any variables to be used in this + method (e.g., smoothing value) must be accessed from the class scope. + + Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. + + Note that we use the natural logarithm; however, any base and corresponding exponent can be + employed. For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :raises: Raises a `RuntimeError` if called without implementing it in the subclass. + """ + raise RuntimeError("cross entropy is not implemented") + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + r"""An unimplemented base class method to compute perplexity. + + The perplexity between a list of target tokens and a list of context tokens is to be + computed by the implementation in the subclass. Note that any variables to be used in this + method (e.g., smoothing value) must be accessed from the class scope. + + Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. + + Note that we use the natural logarithm; however, any base and corresponding exponent can be + employed. For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :raises: Raises a `RuntimeError` if called without implementing it in the subclass. + """ + raise RuntimeError("perplexity is not implemented") + + @final + def evaluate( + self, + target_samples: Union[List[List[str]], np.ndarray], + context_samples: Union[List[List[str]], np.ndarray], + eval_type: str = "cross_entropy", + **kwargs: Optional[Any], + ) -> np.ndarray: + """Computes the average deviation between target and context distributions. 
+ + For a given list of (fixed size) target sample lists and (fixed size) context sample lists, + the :py:meth:`~convokit.LanguageModel.evaluate` method computes the deviation between each + target and corresponding context pair, using `eval_type` language model evaluation metric. + Note that the subclass implementing this abstract base class must define and implement the + `eval_type` evaluation method. The final score output by this method is an average of all + the individual scores. + + Also note that, if specified as keyword arguments, any class variable values are overwritten + from within this method. + + :param target_samples: A list of target sample lists to be used to evaluate against the + corresponding context sample lists. + :param context_samples: A list of context sample lists that are to be used in evaluating the + corresponding target sample lists. + :param eval_type: The language model evaluation function (as `str`), used in evaluating the + language model trained using the context text, evaluated using the target text. Defaults + to "cross_entropy", i.e., calls the :py:meth:`~convokit.LanguageModel.cross_entropy` + method. + :param kwargs: Any additional keyword arguments needed in the language model evaluations. If + any class variables are passed using `kwargs`, the corresponding class variable values + are overwritten using the new values. + :return: The average score that measures the average deviation of target text from context. + """ + self._overwrite_args(list(kwargs.keys()), kwargs) + eval_fn = getattr(self, eval_type) + + if self._n_jobs == 1: + model_scores = [ + eval_fn(target_sample, context_sample) + for target_sample, context_sample in zip(target_samples, context_samples) + ] + else: + model_scores = Parallel(n_jobs=self._n_jobs, backend="threading")( + delayed(eval_fn)(target_sample, context_sample) + for target_sample, context_sample in zip(target_samples, context_samples) + ) + return np.nanmean(model_scores) diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index b03ce3de..73e187d1 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -1,272 +1,458 @@ -import numpy as np -from collections import defaultdict, Counter -from convokit import Transformer -from convokit.model import Corpus, CorpusComponent, Utterance +import warnings +from collections import defaultdict from itertools import chain +from typing import Callable, List, Tuple, Dict, Any, Optional, Union, Set + +import numpy as np +from IPython import get_ipython +from joblib import Parallel, delayed from nltk.tokenize import word_tokenize -from sklearn.feature_extraction.text import CountVectorizer from tqdm import tqdm -from typing import Callable, List, Tuple, Union - -def _cross_entropy(target: List[str], context: List[str], smooth=True): - """ - Calculates H(P,Q) = -sum_{x\in X}(P(x) * log(Q(x))) - - :param target: list of tokens that make up the target text (P) - :param context: list of tokens that make up the context (Q) - :param smooth: whether to use add 1 smoothing for OOV tokens +from convokit import Transformer +from convokit.model import Corpus, Utterance, CorpusComponent +from convokit.util import random_sampler +from .convokit_lm import ConvoKitLanguageModel - :return: cross entropy - """ - N_target, N_context = len(target), len(context) - if min(N_target, N_context) == 0: - return np.nan - context_counts = Counter(context) - V = len(context_counts) + 1 if smooth else 0 - k = 1 if smooth else 0 - val = 0 if smooth else 1 - return 
( - sum(-np.log((context_counts.get(tok, val) + k) / (N_context + V)) for tok in target) - / N_target - ) - - -def sample(tokens: List[Union[np.ndarray, List[str]]], sample_size: int, n_samples=50, p=None): - """ - Generates random samples from a list of lists of tokens. +try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell" or shell == "TerminalInteractiveShell": + from tqdm.notebook import tqdm +except (NameError, ModuleNotFoundError, ImportError): + pass - :param toks: a list of lists of tokens to sample from. - :param sample_size: the number of tokens to include in each sample. - :param n_samples: the number of samples to take. - :return: numpy array where each row is a sample of tokens - """ - if not sample_size: - assert len(tokens) == 1 - return np.tile(tokens[0], (n_samples, 1)) - tokens_list = np.array([toks for toks in tokens if len(toks) >= sample_size]) - if tokens_list.shape[0] == 0: - return None - rng = np.random.default_rng() - sample_idxes = rng.integers(0, tokens_list.shape[0], size=(n_samples)) - return np.array([rng.choice(tokens_list[i], sample_size) for i in sample_idxes]) +class Surprise(Transformer): + """Measures the amount of "surprise" between target and context utterance(s). + This transformer computes how surprising a target utterance or group of utterances is, when + compared to some context. The amount of "surprise" is measured by comparing the deviation + of the target distribution from the context distribution (e.g., cross-entropy, perplexity). + Furthermore, to mitigate the effects of text length on language model evaluation, the surprise + transformer uses several random fixed length samples from target and context text. -class Surprise(Transformer): - """ - Computes how surprising a target (an utterance or group of utterances) is based on some context. - The measure for surprise used is cross entropy. Uses fixed size samples from target and context text - to mitigate effects of length on cross entropy. - - :param model_key_selector: function that defines how utterances should be mapped to models. - Takes in an utterance and returns the key to use for mapping the utterance to a corresponding model. - :param tokenize: optional function that takes in a string and returns a list of tokens in that string. - default: nltk's word_tokenize - :param surprise_attr_name: the name for the metadata attribute to add to objects. - default: surprise - :param target_sample_size: number of tokens to sample from each target (test text). If `None`, then the entire target will be used. - :param context_sample_size: number of tokens to sample from each context (training text). If `None`, then the entire context will be used. - :param n_samples: number of samples to take for each target-context pair. - :param sampling_fn: function for generating samples of tokens. - :param smooth: whether to use laplace smoothing when calculating surprise. + :param model_key_selector: A function that specifies how utterances are to be mapped to models. + The function takes in an utterance and returns the key to use in mapping the utterance to a + corresponding model. + :param tokenizer: A function that returns a list of tokens in a given string, defaults to + `nltk.word_tokenize`. + :param surprise_attr_name: The name for the metadata attribute to add to the objects, defaults + to "surprise". + :param target_sample_size: The number of tokens to sample from each target (test text); when + specified as `None`, then the entire target will be used, defaults to 100. 
+ :param context_sample_size: The number of tokens to sample from each context (training text); + when specified as `None`, then the entire context will be used, defaults to 100. + :param n_samples: The number of samples to take for each target-context pair, defaults to 50. + :param sampling_fn: A function to generate samples of tokens, defaults to a random sampler. + :param n_jobs: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. """ def __init__( self, model_key_selector: Callable[[Utterance], str], tokenizer: Callable[[str], List[str]] = word_tokenize, - surprise_attr_name="surprise", - target_sample_size=100, - context_sample_size=100, - n_samples=50, - sampling_fn: Callable[[np.ndarray, int], np.ndarray] = sample, - smooth: bool = True, + surprise_attr_name: str = "surprise", + target_sample_size: int = 100, + context_sample_size: int = 100, + n_samples: int = 50, + sampling_fn: Callable[ + [List[Union[np.ndarray, List[str]]], int, int], np.ndarray + ] = random_sampler, + n_jobs: int = 1, ): - self.model_key_selector = model_key_selector - self.tokenizer = tokenizer - self.surprise_attr_name = surprise_attr_name - self.target_sample_size = target_sample_size - self.context_sample_size = context_sample_size - self.n_samples = n_samples - self.sampling_fn = sampling_fn - self.smooth = smooth + self._model_key_selector = model_key_selector + self._tokenizer = tokenizer + self._surprise_attr_name = surprise_attr_name + self._target_sample_size = target_sample_size + self._context_sample_size = context_sample_size + self._n_samples = n_samples + self._sampling_fn = sampling_fn + self._n_jobs = n_jobs + self._model_groups = None def fit( self, corpus: Corpus, text_func: Callable[[Utterance], List[str]] = None, selector: Callable[[Utterance], bool] = lambda utt: True, - ): - """ - Fits a model for each group of utterances in a corpus. The group that an - utterance belongs to is determined by the `model_key_selector` parameter in - the transformer's constructor. - - :param corpus: corpus to create models from. - :param text_func: optional function to define how the text a model is trained - on should be selected. Takes an utterance as input and returns a list of - strings to train the model corresponding to that utterance on. The model - corresponding to the utterance is determined by `self.model_key_selector`. - For every utterance corresponding to the same model key, this function - should return the same result. - If `text_func` is `None`, a model will be trained on the text from all - the utterances that belong to its group. - :param selector: determines which utterances in the corpus to train models for. + ) -> Transformer: + """Populate models for each group of utterances in a corpus. + + For each group of utterances in the corpus, a specific model is populated. The group that an + utterance belongs to is determined by the `model_key_selector` parameter in the constructor. + Furthermore, based on the `tokenizer` specified in the constructor, the text corresponding + to the model key is tokenized. + + :param corpus: The corpus to create models from. + :param text_func: The function used to define how the text a model is trained on should be + selected. Takes an utterance as input and returns a list of strings to train the model + corresponding to that utterance on. The model corresponding to the utterance is + determined by the `model_key_selector` parameter specified in the constructor. 
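For reference, the surprise demo notebook builds such a `text_func` by pooling each speaker's tokens from their other conversations and passes it to `fit()`:

```python
import itertools

# Train each speaker-conversation model on that speaker's text from *other* conversations.
text_func = lambda utt: [list(itertools.chain(*[u.meta['joined_tokens']
                                                for u in utt.speaker.iter_utterances()
                                                if u.conversation_id != utt.conversation_id]))]
surp = surp.fit(subset_corpus, text_func=text_func)
```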
For each + utterance corresponding to the same model key, this function should return the same + result. + Defaults to `None`; when the value is `None`, a model will be trained on the text from + all the utterances that belong to its group. + :param selector: A function to specify which utterances in the corpus to train models for. + Defaults to choosing all utterances, `lambda utt: True`. + :return: An instance of the Surprise transformer with the corresponding models populated. """ - self.model_groups = defaultdict(list) - for utt in tqdm(corpus.iter_utterances(selector=selector), desc="fit1"): - key = self.model_key_selector(utt) - if text_func: - if key not in self.model_groups: - self.model_groups[key] = text_func(utt) + self._model_groups = defaultdict(list) + + for utt in tqdm(corpus.iter_utterances(selector=selector), desc="fit"): + key = self._model_key_selector(utt) + if text_func is not None: + if key not in self._model_groups: + self._model_groups[key] = text_func(utt) else: - self.model_groups[key].append(utt.text) - for key in tqdm(self.model_groups, desc="fit2"): - if not text_func: - self.model_groups[key] = [" ".join(self.model_groups[key])] - self.model_groups[key] = list(map(lambda x: self.tokenizer(x), self.model_groups[key])) + self._model_groups[key].append(utt.text) + + for key in tqdm(self._model_groups, desc="fit"): + if text_func is None: + self._model_groups[key] = [" ".join(self._model_groups[key])] + # Using `map()` with `lambda` is (microscopically) costlier than a list comprehension. + # Reference: https://stackoverflow.com/a/1247490/6907625. + self._model_groups[key] = [ + self._tokenizer(utt_text) for utt_text in self._model_groups[key] + ] + return self - def transform( + def _compute_surprise( + self, + target: List[str], + context: List[List[str]], + lm_evaluation_fn: Callable[ + [Union[List[str], np.ndarray], Union[List[str], np.ndarray], Optional[Any]], + np.ndarray, + ], + **kwargs: Optional[Any], + ) -> np.ndarray: + """Compute the amount of "surprise" between target and context utterance(s). + + This method computes how surprising a target text is, when compared to some context. The + amount of "surprise" is measured by comparing the deviation of the target distribution from + the context distribution (e.g., cross-entropy, perplexity). Furthermore, to mitigate the + effects of text length on language model evaluation, several random samples of fixed sizes + are taken from the target and context. + + :param target: A list of tokens in the target. + :param context: A list of lists of tokens in each group of the context. + :param lm_evaluation_fn: The language model evaluation function. If using an instance of + :py:class:`~convokit.LanguageModel`, the :py:meth:`~convokit.LanguageModel.evaluate` + function is to be used here. To see examples of :py:class:`~convokit.LanguageModel`, + see: :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. This + function takes in a list of target samples and corresponding context samples, and + returns the amount of surprise using some underlying language model evaluation metric. + :param kwargs: Additional keyword arguments to be passed to the language model evaluation + function: + + * When using :py:class:`~convokit.LanguageModel`, the following keywords are relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". 
+ * The following arguments, if specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "language_model". + + * When using :py:class:`~convokit.ConvoKitLanguageModel`, the following keywords are + relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". + * The following arguments, if specified, overwrite the existing class values: + + * `smooth`: Indicator of using Laplace smoothing in the computation of surprise + scores, defaults to `True`. + + * The following arguments, inherited from :py:class:`~convokit.LanguageModel`, if + specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "convokit_lm". + + * When using :py:class:`~convokit.Kenlm`, the following keywords are relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". + * The following arguments, if specified, overwrite the existing class values: + + * `ngram_order`: The order of n-gram language model. + * `trained_model_filepath`: The filepath to a pre-trained language model that is + to be persistently used. + * `is_persistent`: Indicator of model persistence, i.e., the model generated + in the first pass or that loaded from `trained_model_filepath` is used in all + evaluations. When `trained_model_filepath` is specified, persistence is + automatically implied. + * `kenlm_path`: The folder path to the folder of KenLM library. + * `models_dir`: The folder path to store the (trained) binary KenLM models. + * `model_filename`: The filename used in storing the KenLM model artefacts. + + * The following arguments, inherited from :py:class:`~convokit.LanguageModel`, if + specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "kenlm". + :return: The surprise score output by the language model evaluation function. + """ + target_tokens = np.array(target) + context_tokens = [np.array(text) for text in context] + target_samples = self._sampling_fn( + [target_tokens], self._target_sample_size, self._n_samples + ) + context_samples = self._sampling_fn( + context_tokens, self._context_sample_size, self._n_samples + ) + + if target_samples is None or context_samples is None: + return np.nan + return lm_evaluation_fn(target_samples, context_samples, **kwargs) + + def _transform( self, corpus: Corpus, obj_type: str, group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None, - group_model_attr_key: Callable[[str, str], str] = None, - selector: Callable[[CorpusComponent], bool] = lambda _: True, target_text_func: Callable[[Utterance], List[str]] = None, - ): - """ - Annotates `obj_type` components in a corpus with surprise scores. Should be - called after fit(). - - :param corpus: corpus to compute surprise for. - :param obj_type: the type of corpus components to annotate. Should be either - 'utterance', 'speaker', 'conversation', or 'corpus'. 
- :param group_and_models: optional function that defines how an utterance should - be grouped to form a target text and what models (contexts) the group should - be compared to when calculating surprise. Takes in an utterance and returns - a tuple containing the name of the group the utterance belongs to and a - list of models to calculate how surprising that group is against. Objects - will be annotated with a metadata field `self.surprise_attr_name` that is - maps a key corresponding to the `groupname` and `modelkey` to the surprise - score for utterances in the group when compared to the model. The key used - is defined by the `group_model_attr_key` parameter. - If `group_and_models` is `None`, `self.model_key_selector` will be used - to select the group that an utterance belongs to. The surprise score will - be calculated for each group of utterances compared to the model in + selector: Callable[[CorpusComponent], bool] = lambda _: True, + group_model_attr_key: Callable[[str, str], str] = None, + **kwargs: Optional[Any], + ) -> Corpus: + """Annotates `obj_type` components in a corpus with surprise scores. + + The transform function adds surprise score metadata to the `obj_type` components in the + given corpus. + + :param corpus: The corpus to compute surprise for. + :param obj_type: The type of corpus components to annotate. Should be one of "utterance", + "speaker", "conversation", or "corpus". + :param group_and_models: A function that defines how an utterance should be grouped to form + a target text and what models (contexts) the group should be compared to in calculating + surprise scores. Takes in an utterance and returns a tuple containing the name of the + group the utterance belongs to and a list of models to calculate how surprising that + group is against. Objects will be annotated with a metadata field `surprise_attr_name` + (specified in the constructor) that maps a key corresponding to the `group_name` and + `model_key` to the surprise score for the utterances in the group when compared to the + model. The key used is defined by the `group_model_attr_key` parameter. + Defaults to `None`; if `group_and_models` is `None`, `model_key_selector` specified in + the constructor will be used to select the group that an utterance belongs to. The + surprise score will be calculated for each group of utterances compared to the model in `self.models` corresponding to the group. - :param group_model_attr_key: optional function to define what key should be used - for a given `groupname` and `modelkey`. - If `group_model_attr_key` is `None`, the default key used will be - "GROUP_groupname_MODEL_modelkey" unless `groupname` and `modelkey` are equal - in which case just "modelkey" will be used as the key. - :param selector: function to select objects to annotate. if function returns true, object will be annotated. - :param target_text_func: optional function to define what the target text corresponding to an utterance should be. - takes in an utterance and returns a list of string tokens + :param target_text_func: A function to define what the target text corresponding to an + utterance should be; takes in an utterance and returns a list of string tokens. + Defaults to `None`. + :param selector: A function to specify which objects in the corpus to train models for, + defaults to choosing all `obj_type` objects, `lambda _: True`. + :param group_model_attr_key: A function that defines what key is to be used for a given + `group_name` and `model_key`, defaults to `None`. 
If `group_model_attr_key` is `None`, + the default key used will be "GROUP_group_name_MODEL_model_key" unless `group_name` and + `model_key` are equal, in which case just "model_key" will be used as the key. + :param kwargs: Additional keyword arguments to be passed for surprise computations (see + the documentation for :py:meth:`~Surprise._compute_surprise()` for these arguments), and + in creating the language model (if needed): + + * `language_model`: An instance of :py:class:`~convokit.LanguageModel` to be used in + computing the surprise scores, defaults to :py:class:`~convokit.ConvoKitLanguageModel` + and the arguments to the :py:class:`~convokit.ConvoKitLanguageModel` can be specified + here as: + + * `smooth`: Indicator of using Laplace smoothing in the computation of surprise + scores, defaults to `True`. + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "convokit_lm". + :return: A modified version of the input corpus with the surprise scores. """ + + def _update_groups_models( + utt_: Utterance, + utt_groups_: Dict[str, List[List[str]]], + group_models_: Dict[str, Set[str]], + ): + """Updates the utterance groups and models based on `groups_and_models`. + + :param utt_: The utterance whose groups and models are to be populated (updated). + :param utt_groups_: Update utterance groups based on `groups_and_models` parameter. The + dictionary is modified in place. + :param group_models_: Update utterance models based on `groups_and_models` parameter. + The dictionary is modified in place. + """ + group_name, models = ( + group_and_models(utt_) + if group_and_models + else (self._model_key_selector(utt_), None) + ) + models = {group_name} if not models else models + if target_text_func: + if group_name not in utt_groups_: + utt_groups_[group_name] = [target_text_func(utt_)] + else: + utt_groups_[group_name].append(self._tokenizer(utt_.text)) + group_models_[group_name].update(models) + + def _format_attr_key( + group_name: str, model_key: str, format_fn: Callable[[str, str], str] = None + ) -> str: + """Formats the surprise score attribute key, given model name and key. + + :param group_name: The group name to be included in the surprise score attribute key. + :param model_key: The model key to be included in the surprise score attribute key. + :param format_fn: A function that takes in the `group_name` and `model_key` and outputs + the formatted attribute key, defaults to `None`. When `group_model_attr_key` is + `None`, the default key used will be "GROUP_group_name_MODEL_model_key" unless + `group_name` and `model_key` are equal, in which case just "model_key" will be used + as the key. + :return: The formatted surprise score attribute key. + """ + if format_fn: + return format_fn(group_name, model_key) + if group_name == model_key: + return model_key + return f"GROUP_{group_name}__MODEL_{model_key}" + + def __surprise_score_helper( + group_name: str, + utt_group: List[List[str]], + group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [ + Union[List[str], np.ndarray], + Union[List[str], np.ndarray], + Optional[Any], + ], + np.ndarray, + ], + ): + """A helper function to aid in the computation of surprise scores. + + :param group_name: The group name corresponding to the group model to be used. 
+ :param utt_group: The utterance group from those populated using `groups_and_models`. + :param group_models_: The group models that were populated using `groups_and_models`. + :param surprise_scores_: The surprise score (dictionary value) that is to be updated for + the corresponding utterance group and model. The dictionary is modified in place. + :param lm_evaluation_fn: The language model evaluation function. If using an instance + of :py:class:`~convokit.LanguageModel`, :py:meth:`~convokit.LanguageModel.evaluate` + function is to be used here. To see examples of :py:class:`~convokit.LanguageModel`, + see: :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. + The function takes in a list of target samples and corresponding context samples, + and returns the amount of surprise using some underlying model evaluation metric. + """ + for model_key in group_models_[group_name]: + assert model_key in self._model_groups, "invalid model key" + surprise_key = _format_attr_key(group_name, model_key, group_model_attr_key) + context = self._model_groups[model_key] + target = list(chain(*utt_group)) + surprise_scores_[surprise_key] = self._compute_surprise( + target, context, lm_evaluation_fn, **kwargs + ) + + def _update_surprise_scores( + utt_groups_: Dict[str, List[List[str]]], + group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [ + Union[List[str], np.ndarray], + Union[List[str], np.ndarray], + Optional[Any], + ], + np.ndarray, + ], + ): + """Populate (update) the surprise score for utterance groups and models. + + :param utt_groups_: The utterance groups that were populated using `groups_and_models`. + :param group_models_: The group models that were populated using `groups_and_models`. + :param surprise_scores_: The surprise scores (dictionary values) that are to be updated + for the corresponding utterance groups and models. The surprise scores dictionary is + modified in place. + :param lm_evaluation_fn: The language model evaluation function. If using an instance + of :py:class:`~convokit.LanguageModel`, the `evaluate` function is to be used here. + To see the subclass implementations of :py:class:`~convokit.LanguageModel`, see: + :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. The + function takes in a list of target samples and corresponding context samples, and + returns the amount of surprise using some underlying model evaluation metric. + """ + if self._n_jobs == 1: + for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2): + __surprise_score_helper( + group_name, + utt_groups_[group_name], + group_models_, + surprise_scores_, + lm_evaluation_fn, + ) + else: + Parallel(n_jobs=self._n_jobs, backend="threading")( + delayed(__surprise_score_helper)( + group_name, + utt_groups_[group_name], + group_models_, + surprise_scores_, + lm_evaluation_fn, + ) + for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2) + ) + + if "n_jobs" in kwargs and kwargs["n_jobs"] != self._n_jobs: + warnings.warn( + f"specified n_jobs={kwargs['n_jobs']}; however, the surprise transformer was " + f"initialized with {self._n_jobs}, so defaulting to {self._n_jobs} jobs." 
+ ) + kwargs["n_jobs"] = self._n_jobs + language_model = ( + kwargs["language_model"] + if "language_model" in kwargs + else ConvoKitLanguageModel(**kwargs) + ) + if obj_type == "corpus": - utt_groups = defaultdict(list) - group_models = defaultdict(set) - for utt in corpus.iter_utterances(): - if group_and_models: - group_name, models = group_and_models(utt) - else: - group_name = self.model_key_selector(utt) - models = {group_name} - if target_text_func: - if group_name not in utt_groups: - utt_groups[group_name] = [target_text_func(utt)] - else: - utt_groups[group_name].append(self.tokenizer(utt.text)) - group_models[group_name].update(models) - surprise_scores = {} - for group_name in tqdm(utt_groups, desc="transform"): - for model_key in group_models[group_name]: - context = self.model_groups[model_key] - target = list(chain(*utt_groups[group_name])) - surprise_scores[ - Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - corpus.add_meta(self.surprise_attr_name, surprise_scores) + surprise_scores = defaultdict() + utt_groups, group_models = defaultdict(list), defaultdict(set) + for utt in tqdm(corpus.iter_utterances(), desc="transform"): + _update_groups_models(utt, utt_groups, group_models) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) + corpus.add_meta(self._surprise_attr_name, surprise_scores) elif obj_type == "utterance": for utt in tqdm(corpus.iter_utterances(selector=selector), desc="transform"): - if group_and_models: - group_name, models = group_and_models(utt) - surprise_scores = {} - for model_key in models: - context = self.model_groups[model_key] - target = ( - target_text_func(utt) if target_text_func else self.tokenizer(utt.text) - ) - surprise_scores[ - Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - utt.add_meta(self.surprise_attr_name, surprise_scores) - else: - group_name = self.model_key_selector(utt) - context = self.model_groups[group_name] - target = target_text_func(utt) if target_text_func else self.tokenizer(utt.text) - utt.add_meta(self.surprise_attr_name, self._compute_surprise(target, context)) + surprise_scores = defaultdict() + utt_groups, group_models = defaultdict(list), defaultdict(set) + _update_groups_models(utt, utt_groups, group_models) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) + utt.add_meta(self._surprise_attr_name, surprise_scores) else: for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc="transform"): - utt_groups = defaultdict(list) - group_models = defaultdict(set) + surprise_scores = defaultdict() + utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in obj.iter_utterances(): - if group_and_models: - group_name, models = group_and_models(utt) - else: - group_name = self.model_key_selector(utt) - models = {group_name} - if target_text_func: - if group_name not in utt_groups: - utt_groups[group_name] = [target_text_func(utt)] - else: - utt_groups[group_name].append(self.tokenizer(utt.text)) - group_models[group_name].update(models) - surprise_scores = {} - for group_name in utt_groups: - for model_key in group_models[group_name]: - assert model_key in self.model_groups, "invalid model key" - if not self.model_groups[model_key]: - continue - context = self.model_groups[model_key] - target = list(chain(*utt_groups[group_name])) - surprise_scores[ - 
Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - obj.add_meta(self.surprise_attr_name, surprise_scores) + _update_groups_models(utt, utt_groups, group_models) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) + obj.add_meta(self._surprise_attr_name, surprise_scores) return corpus - def _compute_surprise(self, target: List[str], context: List[List[str]]): - """ - Computes how surprising a target text is based on a context. Surprise scores are calculated using cross entropy. - To mitigate length based effects on cross entropy, several random sample of fixed sizes are taken from the traget and context. - Returns the average of the cross entropies for all pairs of samples. + def transform(self, corpus: Corpus, **kwargs) -> Corpus: + """A wrapper over :py:meth:`~convokit.Surprise._transform` of the Surprise transformer. - :param target: a list of tokens in the target - :param context: a list of lists of tokens in each group of the context + Note: Since the transformer's :py:meth:`~convokit.Surprise.fit` method populates the model + groups, the :py:meth:`~convokit.Surprise.transform` function is to be called after calling + :py:meth:`~convokit.Surprise.fit`. - :return: surprise score + :param corpus: The corpus to transform. + :param kwargs: Any keyword arguments to be passed to :py:meth:`~convokit.Surprise.transform` + function of the Surprise transformer (e.g., `eval_type`). Refer to the documentation of + :py:meth:`~convokit.Surprise._transform()` for specific keyword arguments. + :return: A modified version of the input corpus with the surprise scores. """ - target_tokens = np.array(target) - context_tokens = [np.array(text) for text in context] - target_samples = self.sampling_fn([target_tokens], self.target_sample_size, self.n_samples) - context_samples = self.sampling_fn(context_tokens, self.context_sample_size, self.n_samples) - if target_samples is None or context_samples is None: - return np.nan - return np.nanmean( - [ - _cross_entropy(target_sample, context_sample, self.smooth) - for target_sample, context_sample in zip(target_samples, context_samples) - ] - ) - - @staticmethod - def _format_attr_key(group_name, model_key, format_fn=None): - if format_fn: - return format_fn(group_name, model_key) - if group_name == model_key: - return model_key - return f"GROUP_{group_name}__MODEL_{model_key}" + return self._transform(corpus=corpus, **kwargs) diff --git a/convokit/tests/surprise/__init__.py b/convokit/tests/surprise/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/convokit/tests/surprise/test_convokit_lm.py b/convokit/tests/surprise/test_convokit_lm.py new file mode 100644 index 00000000..7ee02180 --- /dev/null +++ b/convokit/tests/surprise/test_convokit_lm.py @@ -0,0 +1,75 @@ +import unittest + +from convokit import ConvoKitLanguageModel + + +class TestConvoKitLanguageModel(unittest.TestCase): + def _init(self, target_samples, context_samples): + self._target_samples = target_samples + self._context_samples = context_samples + + def test_cross_entropy_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=True) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="cross_entropy" + ) + self.assertEqual(round(float(score), 2), 1.38) + + def test_cross_entropy_no_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=False) + score = 
convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="cross_entropy" + ) + self.assertEqual(round(float(score), 2), 1.04) + + def test_perplexity_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=True) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="perplexity" + ) + self.assertEqual(round(float(score), 2), 4.02) + + def test_perplexity_no_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=False) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="perplexity" + ) + self.assertEqual(round(float(score), 2), 3.00) + + +class TestWithMemory(TestConvoKitLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_cross_entropy_smooth(self): + super().test_cross_entropy_smooth() + + def test_cross_entropy_no_smooth(self): + super().test_cross_entropy_no_smooth() + + def test_perplexity_smooth(self): + super().test_perplexity_smooth() + + def test_perplexity_no_smooth(self): + super().test_perplexity_no_smooth() + + +class TestWithDb(TestConvoKitLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_cross_entropy_smooth(self): + super().test_cross_entropy_smooth() + + def test_cross_entropy_no_smooth(self): + super().test_cross_entropy_no_smooth() + + def test_perplexity_smooth(self): + super().test_perplexity_smooth() + + def test_perplexity_no_smooth(self): + super().test_perplexity_no_smooth() diff --git a/convokit/tests/surprise/test_language_model.py b/convokit/tests/surprise/test_language_model.py new file mode 100644 index 00000000..45161b49 --- /dev/null +++ b/convokit/tests/surprise/test_language_model.py @@ -0,0 +1,139 @@ +import unittest + +import nltk.lm as nltk_lm +from nltk.util import ngrams, everygrams + +from convokit.surprise import language_model + + +class TestLm(language_model.LanguageModel): + def __init__(self): + super().__init__("test_language_model") + + @staticmethod + def eval_func(target, context): + return abs(len(context) - len(target)) + + +class TestNltkLm(language_model.LanguageModel): + def __init__(self, ngram_order=2): + super().__init__("test_nltk_language_model") + self._ngram_order = ngram_order + + def eval_func(self, target, context): + kneser_ney_lm = nltk_lm.models.KneserNeyInterpolated( + order=self._ngram_order, vocabulary=nltk_lm.Vocabulary(target + context) + ) + kneser_ney_lm.fit([everygrams(context, max_len=self._ngram_order)]) + return kneser_ney_lm.entropy(ngrams(target, n=self._ngram_order)) + + +class TestLanguageModel(unittest.TestCase): + def _init(self, target_samples, context_samples): + self._target_samples = target_samples + self._context_samples = context_samples + + def test_model_type(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + self.assertEqual(test_lm.type, "test_language_model") + + def test_model_config(self): + test_lm = language_model.LanguageModel(model_type="test_language_model", smooth=True) + expected_config = {"model_type": "test_language_model", "n_jobs": 1, "smooth": True} + self.assertEqual(test_lm.config, 
expected_config) + + def test_overwrite_args(self): + test_lm = language_model.LanguageModel(model_type="test_language_model", smooth=True) + try: + test_lm.evaluate(self._target_samples, self._context_samples, smooth=False) + except RuntimeError: + pass + expected_config = {"model_type": "test_language_model", "n_jobs": 1, "smooth": False} + self.assertEqual(test_lm.config, expected_config) + + def test_evaluate_cross_entropy_runtime_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(RuntimeError): + test_lm.evaluate(self._target_samples, self._context_samples, "cross_entropy") + + def test_evaluate_perplexity_runtime_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(RuntimeError): + test_lm.evaluate(self._target_samples, self._context_samples, "perplexity") + + def test_evaluate_unimplemented_attribute_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(AttributeError): + test_lm.evaluate(self._target_samples, self._context_samples, "unimplemented") + + def test_evaluate(self): + test_lm = TestLm() + score = test_lm.evaluate(self._target_samples, self._context_samples, "eval_func") + self.assertEqual(score, 0.5) + + def test_evaluate_nltk(self): + test_lm = TestNltkLm() + score = test_lm.evaluate(self._target_samples, self._context_samples, "eval_func") + self.assertEqual(round(float(score), 2), 1.25) + + +class TestWithMemory(TestLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_model_type(self): + super().test_model_type() + + def test_model_config(self): + super().test_model_config() + + def test_overwrite_args(self): + super().test_overwrite_args() + + def test_evaluate_cross_entropy_runtime_error(self): + super().test_evaluate_cross_entropy_runtime_error() + + def test_evaluate_perplexity_runtime_error(self): + super().test_evaluate_perplexity_runtime_error() + + def test_evaluate_unimplemented_attribute_error(self): + super().test_evaluate_unimplemented_attribute_error() + + def test_evaluate(self): + super().test_evaluate() + + def test_evaluate_nltk(self): + super().test_evaluate_nltk() + + +class TestWithDb(TestLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_model_type(self): + super().test_model_type() + + def test_model_config(self): + super().test_model_config() + + def test_overwrite_args(self): + super().test_overwrite_args() + + def test_evaluate_cross_entropy_runtime_error(self): + super().test_evaluate_cross_entropy_runtime_error() + + def test_evaluate_perplexity_runtime_error(self): + super().test_evaluate_perplexity_runtime_error() + + def test_evaluate_unimplemented_attribute_error(self): + super().test_evaluate_unimplemented_attribute_error() + + def test_evaluate(self): + super().test_evaluate() + + def test_evaluate_nltk(self): + super().test_evaluate_nltk() diff --git a/convokit/tests/surprise/test_surprise.py b/convokit/tests/surprise/test_surprise.py new file mode 100644 index 00000000..243f960d --- /dev/null +++ b/convokit/tests/surprise/test_surprise.py @@ -0,0 +1,191 
@@ +import random +import unittest + +import numpy as np + +from convokit.surprise import Surprise, ConvoKitLanguageModel +from convokit.tests.test_utils import small_burr_conv_corpus + + +class TestSurprise(unittest.TestCase): + def _init(self, corpus) -> None: + self._corpus = corpus + + def test_fit_model_groups(self): + surprise = Surprise( + model_key_selector=lambda utt: "_".join([utt.speaker.id, utt.conversation_id]) + ) + surprise = surprise.fit(self._corpus) + expected_model_groups = { + "hamilton_0": [["Pardon", "me", "."]], + "hamilton_1": [["Who", "'s", "asking", "?"]], + "hamilton_2": [["Are", "you", "Aaron", "Burr", ",", "sir", "?"]], + "burr_0": [["Are", "you", "Aaron", "Burr", ",", "sir", "?"]], + "burr_1": [["That", "depends", ".", "Pardon", "me", "."]], + "burr_2": [["That", "depends", "."]], + } + self.assertEqual(surprise._model_groups, expected_model_groups) + + def test_fit_model_groups_text_func_selector(self): + surprise = Surprise( + model_key_selector=lambda utt: "_".join([utt.speaker.id, utt.conversation_id]) + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join( + [ + speaker_utt.text + for speaker_utt in utt.speaker.iter_utterances() + if speaker_utt.conversation_id != utt.conversation_id + ] + ) + ], + selector=lambda utt: utt.conversation_id == "0", + ) + expected_model_groups = { + "hamilton_0": [ + ["Who", "'s", "asking", "?", "Are", "you", "Aaron", "Burr", ",", "sir", "?"] + ], + "burr_0": [["That", "depends", ".", "Pardon", "me", ".", "That", "depends", "."]], + } + self.assertEqual(surprise._model_groups, expected_model_groups) + + def test_transform_large_context_target_size(self): + surprise = Surprise(model_key_selector=lambda utt: "corpus") + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance") + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.array([score["corpus"] for score in utts]) + self.assertTrue(np.isnan(surprise_scores).all()) + + def test_transform_multiple_jobs(self): + surprise = Surprise(model_key_selector=lambda utt: "corpus", n_jobs=2) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance", n_jobs=2) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.array([score["corpus"] for score in utts]) + self.assertTrue(np.isnan(surprise_scores).all()) + + def test_transform_convokit_language_model(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + language_model = ConvoKitLanguageModel(smooth=False) + transformed_corpus = surprise.transform( + self._corpus, obj_type="utterance", language_model=language_model + ) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + def 
test_transform_language_model_parameters(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance", smooth=False) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + def test_transform(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance") + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.8, 1.7, 1.7, 1.8, 1.7, 1.8, 1.8]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + +class TestWithMemory(TestSurprise): + def setUp(self) -> None: + self._small_burr_corpus = small_burr_conv_corpus() + super()._init(self._small_burr_corpus) + + def test_fit_model_groups(self): + super().test_fit_model_groups() + + def test_fit_model_groups_text_func_selector(self): + super().test_fit_model_groups_text_func_selector() + + def test_transform_large_context_target_size(self): + super().test_transform_large_context_target_size() + + def test_transform_multiple_jobs(self): + super().test_transform_multiple_jobs() + + def test_transform_convokit_language_model(self): + super().test_transform_convokit_language_model() + + def test_transform_language_model_parameters(self): + super().test_transform_language_model_parameters() + + def test_transform(self): + super().test_transform() + + +class TestWithDb(TestSurprise): + def setUp(self) -> None: + self._small_burr_corpus = small_burr_conv_corpus() + super()._init(self._small_burr_corpus) + + def test_fit_model_groups(self): + super().test_fit_model_groups() + + def test_fit_model_groups_text_func_selector(self): + super().test_fit_model_groups_text_func_selector() + + def test_transform_large_context_target_size(self): + super().test_transform_large_context_target_size() + + def test_transform_multiple_jobs(self): + super().test_transform_multiple_jobs() + + def test_transform_convokit_language_model(self): + super().test_transform_convokit_language_model() + + def test_transform_language_model_parameters(self): + super().test_transform_language_model_parameters() + + def test_transform(self): + super().test_transform() diff --git a/convokit/tests/test_utils.py b/convokit/tests/test_utils.py index 42bd5759..d9a210ab 100644 --- a/convokit/tests/test_utils.py +++ b/convokit/tests/test_utils.py @@ -9,7 +9,10 @@ FOX_TEXT = "A quick brown fox jumps over the lazy dog." BUFFALO_TEXT = "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" -FOX_BUFFALO_TEXT = "A quick brown fox jumps over the lazy dog. 
Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" +FOX_BUFFALO_TEXT = ( + "A quick brown fox jumps over the lazy dog. Buffalo buffalo Buffalo buffalo " + "buffalo buffalo Buffalo buffalo" +) BURR_SIR_TEXT_1 = "Pardon me. Are you Aaron Burr, sir?" BURR_SIR_TEXT_2 = "That depends. Who's asking?" BURR_SIR_SENTENCE_1 = "Pardon me." @@ -129,6 +132,31 @@ def small_burr_corpus(): return Corpus(utterances=utterances) +def small_burr_conv_corpus(): + hamilton = Speaker(id="hamilton") + burr = Speaker(id="burr") + + utterances = [ + Utterance(id="0", text=BURR_SIR_SENTENCE_1, conversation_id="0", speaker=hamilton), + Utterance( + id="1", text=BURR_SIR_SENTENCE_2, conversation_id="0", reply_to="0", speaker=burr + ), + Utterance(id="2", text=BURR_SIR_SENTENCE_3, conversation_id="1", speaker=burr), + Utterance( + id="3", text=BURR_SIR_SENTENCE_4, conversation_id="1", reply_to="2", speaker=hamilton + ), + Utterance( + id="4", text=BURR_SIR_SENTENCE_1, conversation_id="1", reply_to="3", speaker=burr + ), + Utterance(id="5", text=BURR_SIR_SENTENCE_2, conversation_id="2", speaker=hamilton), + Utterance( + id="6", text=BURR_SIR_SENTENCE_3, conversation_id="2", reply_to="5", speaker=burr + ), + ] + + return Corpus(utterances=utterances) + + def small_burr_corpus_parsed(): corpus = small_burr_corpus() utterance_infos = [ diff --git a/convokit/util.py b/convokit/util.py index 6be3f65d..d9f99f7d 100644 --- a/convokit/util.py +++ b/convokit/util.py @@ -1,12 +1,15 @@ import json import os import shutil +import tempfile import urllib.request import uuid import warnings import zipfile -from typing import Dict +from pathlib import Path +from typing import Dict, Union, Optional, List, IO +import numpy as np import requests @@ -238,7 +241,6 @@ def download_local(name: str, data_dir: str): def _download_helper( dataset_path: str, url: str, verbose: bool, name: str, downloadeds_path: str ) -> None: - if ( url.lower().endswith(".corpus") or url.lower().endswith(".corpus.zip") @@ -254,7 +256,15 @@ def _download_helper( if length > 1e6 else str(round(length / 1e3, 1)) + "KB" ) - print("Downloading", name, "from", url, "(" + length + ")...", end=" ", flush=True) + print( + "Downloading", + name, + "from", + url, + "(" + length + ")...", + end=" ", + flush=True, + ) shutil.copyfileobj(response, out_file) # post-process (extract) corpora @@ -278,7 +288,9 @@ def _download_helper( ) # os.path.join(os.path.dirname(data), name) f.write( "{}$#${}$#${}\n".format( - name, os.path.realpath(os.path.dirname(dataset_path) + "/"), corpus_version(fn) + name, + os.path.realpath(os.path.dirname(dataset_path) + "/"), + corpus_version(fn), ) ) # f.write(name + "\n") @@ -292,7 +304,6 @@ def corpus_version(filename: str) -> int: # retrieve grouping and completes the download link for subreddit def get_subreddit_info(subreddit_name: str) -> str: - # base directory of subreddit corpuses subreddit_base = "http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/" data_dir = subreddit_base + "corpus-zipped/" @@ -335,13 +346,17 @@ def _get_wikiconv_year_info(year: str) -> str: def _get_supreme_info(year: str) -> str: - supreme_base = "http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/" return supreme_base + "supreme-" + year + ".zip" def meta_index(corpus=None, filename: str = None) -> Dict: - keys = ["utterances-index", "conversations-index", "speakers-index", "overall-index"] + keys = [ + "utterances-index", + "conversations-index", + "speakers-index", + "overall-index", + ] if corpus is not None: return 
{k: v for k, v in corpus.meta_index.items() if k in keys} if filename is not None: @@ -379,3 +394,55 @@ def deprecation(prev_name: str, new_name: str, stacklevel: int = 3): def create_safe_id(): return "_" + uuid.uuid4().hex + + +def random_sampler( + tokens: List[Union[np.ndarray, List[str]]], sample_size: int, n_samples: int +) -> Optional[np.ndarray]: + """Generates random samples from a list of lists of tokens. + + :param tokens: A list of lists of tokens to sample from. + :param sample_size: The number of tokens to include in each sample. + :param n_samples: The number of samples to take. + :return: A `numpy.array`, where each row is a sample of tokens. + """ + if not sample_size: + assert len(tokens) == 1 + return np.tile(tokens[0], (n_samples, 1)) + + tokens_list = np.array([tokens_ for tokens_ in tokens if len(tokens_) >= sample_size]) + if tokens_list.shape[0] == 0: + return None + + rng = np.random.default_rng() + sample_idxs = rng.integers(0, tokens_list.shape[0], size=n_samples) + return np.array([rng.choice(tokens_list[idx], sample_size) for idx in sample_idxs]) + + +def create_temp_files(num_files: int) -> List[IO]: + """Creates a specified number of `tempfile` files. + + :param num_files: The number of `tempfile` files to be created. + :return: A list of `tempfile.NamedTemporaryFile` files. + """ + tmp_files = [] + for _ in range(num_files): + tmp_files.append(tempfile.NamedTemporaryFile("w", delete=True)) + return tmp_files + + +def delete_files(tmp_filenames: List[str], remove_parent_dir: bool = True): + """Delete temporary files generated intermittently. + + :param tmp_filenames: The filenames of all the files to be deleted. + :param remove_parent_dir: Indicator of whether the parent directory is to be deleted, if it is + empty after deleting all the temporary files, defaults to True. + """ + tmp_filepaths = [Path(tmp_filename) for tmp_filename in tmp_filenames] + parent_dir = tmp_filepaths[0].parents[0] + + for tmp_filepath in tmp_filepaths: + Path.unlink(tmp_filepath, missing_ok=True) + + if remove_parent_dir and len(list(parent_dir.glob("*"))) == 0: + Path.rmdir(parent_dir) diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index f92d29cc..0a001512 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -16,4 +16,5 @@ These are the transformers related to generating some analysis of the Corpus. Pairer PairedPrediction Ranker + Surprise SpeakerConvoDiversity diff --git a/docs/source/language_model.rst b/docs/source/language_model.rst new file mode 100644 index 00000000..f3b56c13 --- /dev/null +++ b/docs/source/language_model.rst @@ -0,0 +1,21 @@ +Language model +============== + +Implements a language model and defines the `evaluate()` method, to perform +language model evaluation by comparing the deviation of the target distribution +from the context distribution (e.g., cross-entropy, perplexity). + +Base class +---------- + +.. automodule:: convokit.surprise.language_model + :members: + +Subclasses +---------- + +.. automodule:: convokit.surprise.convokit_lm + :members: + +.. automodule:: convokit.surprise.kenlm + :members: diff --git a/docs/source/surprise.rst b/docs/source/surprise.rst new file mode 100644 index 00000000..18bb23ad --- /dev/null +++ b/docs/source/surprise.rst @@ -0,0 +1,24 @@ +Surprise +======== + +Implements the measure of how "surprising" conversations are (e.g., across users +or within user conversations), thereby measuring users' language evolutions over +time. For reference, see the `tie-breaker paper +`_. 
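Before the linked demos below, a minimal sketch of the intended fit/transform flow (an editorial illustration, not part of the patch) may help orient readers. It assumes an already-loaded `corpus`; the per-speaker model grouping and the sample sizes are purely illustrative:

    from convokit.surprise import Surprise, ConvoKitLanguageModel

    # One model per speaker; any utterance-to-model-key mapping can be used instead.
    surprise = Surprise(
        model_key_selector=lambda utt: utt.speaker.id,
        target_sample_size=10,   # targets shorter than this are scored as NaN,
        context_sample_size=50,  # so keep sample sizes within your data's lengths
        n_samples=50,
    )
    surprise.fit(corpus)  # tokenizes and groups the training text per model key

    # Score each utterance against the model of its own speaker (the default when no
    # `group_and_models` is given); ConvoKitLanguageModel is also the default scorer
    # and is passed explicitly here only for clarity.
    corpus = surprise.transform(
        corpus,
        obj_type="utterance",
        language_model=ConvoKitLanguageModel(smooth=True),
    )
    # Each utterance now carries a "surprise" meta field: a dict mapping the model
    # key (here, the speaker id) to its score.
    print(next(corpus.iter_utterances()).meta["surprise"])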
+
+Example usage: `surprise demo
+`_,
+`tennis demo
+`_.
+
+.. automodule:: convokit.surprise.surprise
+    :members:
+    :private-members: _transform, _compute_surprise
+
+References
+----------
+
+.. toctree::
+    :maxdepth: 3
+
+    LanguageModel
diff --git a/setup.py b/setup.py
index c7b41e28..d6cfff81 100644
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@
     ],
     extras_require={
         "craft": ["torch>=0.12"],
+        "kenlm": ["kenlm>=0.0.0"],
     },
     classifiers=[
         "Programming Language :: Python",
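A closing note on scoring backends: the `kenlm` entry added to `extras_require` is optional, and the default `ConvoKitLanguageModel` needs no extra dependencies. The sketch below (again an illustration, not part of the patch) exercises it directly on the small token samples used in the new unit tests; the values quoted in the comment come from those tests:

    from convokit.surprise import ConvoKitLanguageModel

    target_samples = [["this", "is", "test"], ["is", "test"]]
    context_samples = [["this", "is", "a", "test"], ["this", "test"]]

    lm = ConvoKitLanguageModel(smooth=False)
    # Per the new tests, these evaluate to roughly 1.04 and 3.00, respectively.
    print(lm.evaluate(target_samples, context_samples, eval_type="cross_entropy"))
    print(lm.evaluate(target_samples, context_samples, eval_type="perplexity"))

To use KenLM instead (installable via the new extra, e.g. `pip install convokit[kenlm]`), pass a `Kenlm` instance as the `language_model` keyword to `Surprise.transform()` in exactly the same way.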