diff --git a/convokit/speakerConvoDiversity/speakerConvoDiversity2.py b/convokit/speakerConvoDiversity/speakerConvoDiversity2.py index 34a8054a..f0d8fd01 100644 --- a/convokit/speakerConvoDiversity/speakerConvoDiversity2.py +++ b/convokit/speakerConvoDiversity/speakerConvoDiversity2.py @@ -126,6 +126,7 @@ def transform(self, corpus): corpus, "speaker", target_text_func=lambda utt: self._get_utt_row(utt, input_table).tokens, + smooth=False ) self._set_output(corpus, input_table) return corpus @@ -163,8 +164,7 @@ def _init_surprise(self, model_key_selector): surprise_attr_name=self.surprise_attr_name, target_sample_size=target_sample_size, context_sample_size=context_sample_size, - n_samples=n_samples, - smooth=False, + n_samples=n_samples ) def _get_text_func(self, utt: Utterance, df: pd.DataFrame): diff --git a/convokit/surprise/__init__.py b/convokit/surprise/__init__.py index d6d19d0c..57568901 100644 --- a/convokit/surprise/__init__.py +++ b/convokit/surprise/__init__.py @@ -1 +1,15 @@ +import importlib.util +import sys + +from .convokit_lm import * +from .language_model import * from .surprise import * + +if "kenlm" in sys.modules: + from .kenlm import * +elif (spec := importlib.util.find_spec("kenlm")) is not None: + module = importlib.util.module_from_spec(spec) + sys.modules["kenlm"] = module + spec.loader.exec_module(module) + + from .kenlm import * diff --git a/convokit/surprise/convokit_lm.py b/convokit/surprise/convokit_lm.py new file mode 100644 index 00000000..1b7ee5e5 --- /dev/null +++ b/convokit/surprise/convokit_lm.py @@ -0,0 +1,86 @@ +from collections import Counter +from typing import Optional, Any, Union, List + +import numpy as np + +from .language_model import LanguageModel + + +class ConvoKitLanguageModel(LanguageModel): + """A simple language model to compute the deviation of target from context. + + This language model implements cross-entropy and perplexity language model evaluation functions, + to be used in evaluating the average deviation of target from the specified context. + + :param model_type: The name (identifier) of the :py:class:`~convokit.ConvoKitLanguageModel`, + defaults to "convokit_lm". Note that the `model_type` can be accessed using the `type` + property (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. This + language model currently uses the following keyword arguments: + + * `smooth`: Indicator of using Laplace smoothing in the computation of cross-entropy scores, + defaults to `True`. + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. + + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). + """ + + def __init__(self, model_type: str = "convokit_lm", **kwargs: Optional[Any]): + super().__init__(model_type, **kwargs) + + self._smooth = kwargs["smooth"] if "smooth" in kwargs else True + + def cross_entropy( + self, + target: Union[List[str], np.ndarray], + context: Union[List[str], np.ndarray], + ) -> float: + r"""Implements the base class method to compute the cross-entropy. + + Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. Note that we use the + natural logarithm; however, any base and corresponding exponent can be employed. For + instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). 
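As a quick numeric illustration of the smoothed estimate described in this docstring (a standalone sketch that simply mirrors the formula; the token lists are made up and the helper below is not part of this diff):

```python
import numpy as np
from collections import Counter

def smoothed_cross_entropy(target, context):
    # +1 (Laplace) smoothing: the effective vocabulary is the number of distinct
    # context tokens plus one slot reserved for out-of-vocabulary tokens.
    counts = Counter(context)
    vocab_size = len(counts) + 1
    return sum(
        -np.log((counts.get(token, 0) + 1) / (len(context) + vocab_size))
        for token in target
    ) / len(target)

# "b" never occurs in the context, but smoothing keeps Q("b") > 0.
print(smoothed_cross_entropy(["a", "b"], ["a", "a", "c"]))  # ~1.24 nats
```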
+ + The smoothing boolean argument, `smooth`, is accessed from the setting in the language model + constructor (defaults to `True` when unspecified). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :return: The cross-entropy score computed as :math:`H(P, Q)`. + """ + n_target, n_context = len(target), len(context) + if min(n_target, n_context) == 0: + return np.nan + + context_counts = Counter(context) + smooth_v = len(context_counts) + 1 if self._smooth else 0 + smooth_k = 1 if self._smooth else 0 + value = 0 if self._smooth else 1 + + return ( + sum( + -np.log((context_counts.get(token, value) + smooth_k) / (n_context + smooth_v)) + for token in target + ) + / n_target + ) + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + r"""Implements the base class method to compute perplexity. + + Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. Note that + we use the natural logarithm; however, any base and corresponding exponent can be employed. + For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). + + For convenience, the perplexity score is computed as the exponentiation of the cross-entropy + calculated using the `cross_entropy()` method. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :return: The perplexity score computed as :math:`\text{PPL}(P, Q)`. + """ + return np.exp(self.cross_entropy(target, context)) diff --git a/convokit/surprise/demos/surprise_demo.ipynb b/convokit/surprise/demos/surprise_demo.ipynb index 92946d6c..036fcdb5 100644 --- a/convokit/surprise/demos/surprise_demo.ipynb +++ b/convokit/surprise/demos/surprise_demo.ipynb @@ -11,17 +11,23 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "import convokit\n", "import itertools\n", + "\n", "import numpy as np\n", "import spacy\n", - "from convokit import Corpus, download, Surprise\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "import convokit\n", + "from convokit import Corpus, download\n", + "from convokit import Surprise, ConvoKitLanguageModel, Kenlm\n", "from convokit.text_processing import TextProcessor, TextParser\n", - "from sklearn.feature_extraction.text import CountVectorizer" + "\n", + "from tqdm.notebook import tqdm\n", + "import pprint as pp" ] }, { @@ -35,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "tags": [] }, @@ -44,7 +50,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dataset already exists at /home/axl4/.convokit/downloads/subreddit-Cornell\n" + "Dataset already exists at /Users/tushaar/.convokit/downloads/subreddit-Cornell\n" ] } ], @@ -54,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "tags": [] }, @@ -82,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -93,27 +99,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/axl4/ConvoKit/convokit/model/corpus.py:1213: FutureWarning: set_info() is deprecated and will be removed in a future release. 
Use add_meta() instead.\n", - "/home/axl4/ConvoKit/convokit/model/corpus.py:1219: FutureWarning: set_info() is deprecated and will be removed in a future release. Use add_meta() instead.\n" - ] - } - ], + "outputs": [], "source": [ "corpus.organize_speaker_convo_history(utterance_filter=utterance_is_valid)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -122,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -213,7 +210,7 @@ "Fencerman2 298.0" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -233,19 +230,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", - "subset_utts = [list(corpus.get_speaker(speaker).iter_utterances(selector=utterance_is_valid)) for speaker in top_speakers]\n", + "subset_utts = [list(corpus.get_speaker(speaker).iter_utterances(selector=utterance_is_valid)) \n", + " for speaker in top_speakers]\n", "subset_corpus = Corpus(utterances=list(itertools.chain(*subset_utts)))" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "tags": [] }, @@ -276,47 +274,43 @@ "\n", "The transformer also has an optional `tokenizer` parameter to customize tokenization. Here we will tokenize the text outside of the surprise transformer, so our tokenizer will be an identity function.\n", "\n", - "The `smooth` parameter determines whether the transformer uses +1 laplace smoothing (`smooth = True`) or naively replaces 0 counts with 1's as the SpeakerConvoDiversity transformer does (`smooth = False`)." 
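With this change, smoothing is no longer a `Surprise` constructor argument: it is configured on the language model (e.g., `ConvoKitLanguageModel(smooth=False)`) or passed through at transform time, as the `speakerConvoDiversity2.py` hunk above does. A minimal sketch reusing this notebook's objects:

```python
# Configure smoothing on the language model rather than on the transformer.
convokit_lm = ConvoKitLanguageModel(smooth=False)
transformed_corpus = surp.transform(subset_corpus, obj_type='speaker',
                                    language_model=convokit_lm,
                                    eval_type='cross_entropy')
```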
+ "" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "\n", "spacy_nlp = spacy.load('en_core_web_sm', disable=['ner','parser', 'tagger', 'lemmatizer'])\n", - "for utt in subset_corpus.iter_utterances():\n", + "for utt in tqdm(subset_corpus.iter_utterances()):\n", " utt.meta['joined_tokens'] = [t.text.lower() for t in spacy_nlp(utt.text)]" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "surp = Surprise(tokenizer=lambda x: x, model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), target_sample_size=100, context_sample_size=1000, n_samples=50, smooth=True)" + "surp = Surprise(tokenizer=lambda x: x, \n", + " model_key_selector=lambda utt: '_'.join([utt.speaker.id, utt.conversation_id]), \n", + " target_sample_size=100, context_sample_size=1000, n_samples=50, n_jobs=8)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "fit1: 20550it [00:16, 1283.44it/s]\n", - "fit2: 100%|██████████| 15394/15394 [00:00<00:00, 1032033.56it/s]\n" - ] - } - ], + "outputs": [], "source": [ - "surp = surp.fit(subset_corpus, text_func=lambda utt: [list(itertools.chain(*[u.meta['joined_tokens'] for u in utt.speaker.iter_utterances() if u.conversation_id != utt.conversation_id]))])" + "surp = surp.fit(subset_corpus, \n", + " text_func=lambda utt: [list(itertools.chain(*[u.meta['joined_tokens'] \n", + " for u in utt.speaker.iter_utterances() \n", + " if u.conversation_id != utt.conversation_id]))])" ] }, { @@ -330,21 +324,61 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "tags": [] - }, + "execution_count": 19, + "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "transform: 100it [15:57, 9.57s/it]\n" + "{'model_type': 'convokit_lm', 'n_jobs': 8, 'smooth': True}\n" ] } ], "source": [ - "transformed_corpus = surp.transform(subset_corpus, obj_type='speaker')" + "convokit_lm = ConvoKitLanguageModel(n_jobs=8)\n", + "pp.pprint(convokit_lm.config)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'is_persistent': False,\n", + " 'kenlm_path': '/Users/tushaar/kenlm',\n", + " 'model_filename': 'kenlm_surprise',\n", + " 'model_type': 'kenlm',\n", + " 'models_dir': '/Users/tushaar/Desktop/kenlm_models',\n", + " 'n_jobs': 8,\n", + " 'ngram_order': 2}\n" + ] + } + ], + "source": [ + "# Replace with appropriate paths to your kenlm directory\n", + "# and the folder to save the models.\n", + "kenlm = Kenlm(kenlm_path='/Users/tushaar/kenlm', \n", + " models_dir='/Users/tushaar/Desktop/kenlm_models', \n", + " model_filename='kenlm_surprise', \n", + " n_jobs=8)\n", + "pp.pprint(kenlm.config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "transformed_corpus = surp.transform(subset_corpus, obj_type='speaker', \n", + " language_model=convokit_lm, eval_type='cross_entropy')" ] }, { @@ -357,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -386,26 +420,26 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ - "EQUASHNZRKUL_815y6t 7.233156\n", - "SwissWatchesOnly_8g5q88 7.216094\n", - "SwissWatchesOnly_67cljd 7.129933\n", - "EQUASHNZRKUL_73xuw6 7.114335\n", - "Straight_Derpin_5kst5l 7.067594\n", - "laveritecestla_6v4ysm 7.066840\n", - "ClawofBeta_52u1nu 7.059744\n", - "Udontlikecake_7rj6a0 7.053087\n", - "syntheticity_97zg9z 7.041747\n", - "DEEP_THORAX_8drwet 7.038059\n", + "EQUASHNZRKUL_815y6t 7.258089\n", + "SwissWatchesOnly_8g5q88 7.199374\n", + "SwissWatchesOnly_67cljd 7.125212\n", + "EQUASHNZRKUL_73xuw6 7.100619\n", + "Udontlikecake_7rj6a0 7.083575\n", + "ClawofBeta_52u1nu 7.081842\n", + "Straight_Derpin_5kst5l 7.080008\n", + "syntheticity_97zg9z 7.055642\n", + "CornellMan333_9iwucv 7.043682\n", + "t3hasiangod_42k6wa 7.040483\n", "dtype: float64" ] }, - "execution_count": 16, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -424,26 +458,26 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Unga_Bunga_30ac0l 5.841967\n", - "Bisphosphate_7r8nu1 5.941750\n", - "crash_over-ride_6bjxnm 5.945221\n", - "crash_over-ride_8f7b0y 5.962945\n", - "crash_over-ride_7owfvv 5.963205\n", - "crash_over-ride_30zba1 5.970271\n", - "crash_over-ride_2vhtzx 5.970866\n", - "crash_over-ride_t6w01 5.981621\n", - "omgdonerkebab_v4a3p 5.981898\n", - "crash_over-ride_9b132c 5.983570\n", + "Unga_Bunga_30ac0l 5.849274\n", + "crash_over-ride_30zba1 5.937072\n", + "omgdonerkebab_v4a3p 5.944469\n", + "Bisphosphate_7r8nu1 5.960513\n", + "crash_over-ride_t6w01 5.962633\n", + "crash_over-ride_6bjxnm 5.967824\n", + "crash_over-ride_v4j70 5.980576\n", + "crash_over-ride_2vhtzx 5.982879\n", + "crash_over-ride_8f7b0y 5.990480\n", + "crash_over-ride_9b132c 6.002238\n", "dtype: float64" ] }, - "execution_count": 17, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -470,7 +504,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -484,7 +518,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/convokit/surprise/demos/tennis_demo.ipynb b/convokit/surprise/demos/tennis_demo.ipynb index a5012807..f352404c 100644 --- a/convokit/surprise/demos/tennis_demo.ipynb +++ b/convokit/surprise/demos/tennis_demo.ipynb @@ -10,16 +10,22 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "import convokit\n", "import json\n", + "\n", "import numpy as np\n", "from collections import defaultdict\n", - "from convokit import Corpus, Speaker, Utterance, download, Surprise\n", - "from tqdm import tqdm" + "from nltk.tokenize import word_tokenize, sent_tokenize\n", + "\n", + "import convokit\n", + "from convokit import Surprise, ConvoKitLanguageModel, Kenlm\n", + "from convokit import Corpus, Speaker, Utterance, download\n", + "\n", + "from tqdm.notebook import tqdm\n", + "import pprint as pp" ] }, { @@ -32,17 +38,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "PATH = '/home/axl4' # replace with your path to tennis_data directory\n", + "PATH = '../../../../examples' # replace with your path to tennis_data directory\n", "data_dir = f'{PATH}/tennis_data/'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, 
"outputs": [], "source": [ @@ -51,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -61,15 +67,22 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 3962/3962 [00:00<00:00, 267184.91it/s]\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7ef5f528cab74142aca4e45705e3e631", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3962 [00:00" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "surp.fit(game_commentary_corpus, text_func=lambda utt: [' '.join([u.text for u in game_commentary_corpus.iter_utterances()])])" + "surp = surp.fit(game_commentary_corpus, \n", + " text_func=lambda utt: [' '.join([u.text for u in game_commentary_corpus.iter_utterances()])])" ] }, { @@ -219,16 +223,19 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", "SAMPLE = True\n", - "SAMPLE_SIZE = 10000 # edit this to change the number of interview questions to calculate surprise for\n", + "SAMPLE_SIZE = 500 # edit this to change the number of interview questions to calculate surprise for\n", "\n", - "subset_utts = [interview_corpus.get_utterance(utt) for utt in interview_corpus.get_utterances_dataframe(selector=lambda utt: utt.meta['is_question']).sample(SAMPLE_SIZE).index]\n", + "subset_utts = \\\n", + " [interview_corpus.get_utterance(utt)\n", + " for utt in interview_corpus.get_utterances_dataframe(selector=lambda utt: \n", + " utt.meta['is_question']).sample(SAMPLE_SIZE).index]\n", "subset_corpus = Corpus(utterances=subset_utts) if SAMPLE else interview_corpus" ] }, @@ -241,29 +248,41 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "transform: 10000it [31:05, 5.36it/s]\n" + "{'is_persistent': True,\n", + " 'kenlm_path': '/Users/tushaar/kenlm',\n", + " 'model_filename': 'kenlm_surprise',\n", + " 'model_type': 'kenlm',\n", + " 'models_dir': '../../../../examples/kenlm_models',\n", + " 'n_jobs': 1,\n", + " 'ngram_order': 2}\n" ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "surp.transform(subset_corpus, obj_type='utterance', selector=lambda utt: utt.meta['is_question'])" + "# Replace with appropriate paths to your kenlm directory.\n", + "kenlm = Kenlm(kenlm_path='/Users/tushaar/kenlm', \n", + " models_dir=f'{PATH}/kenlm_models', \n", + " model_filename='kenlm_surprise', \n", + " is_persistent=True)\n", + "pp.pprint(kenlm.config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subset_corpus = surp.transform(subset_corpus, obj_type='utterance',\n", + " selector=lambda utt: utt.meta['is_question'], \n", + " language_model=kenlm, eval_type='cross_entropy')" ] }, { @@ -276,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -285,45 +304,46 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7.1372781396723255" + 
"36.832740783691406" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", + "get_scores = lambda utterances: pd.Series([score['corpus']for score in utterances], index=utterances.index)\n", "\n", - "female_qs = pd.to_numeric(utterances[utterances['meta.player_gender'] == 'F']['meta.surprise']).dropna()\n", + "female_qs = get_scores(utterances[utterances['meta.player_gender'] == 'F']['meta.surprise']).dropna()\n", "female_qs.median()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7.147981123495766" + "37.093317667643234" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "male_qs = pd.to_numeric(utterances[utterances['meta.player_gender'] == 'M']['meta.surprise']).dropna()\n", + "male_qs = get_scores(utterances[utterances['meta.player_gender'] == 'M']['meta.surprise']).dropna()\n", "male_qs.median()" ] }, @@ -343,7 +363,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -353,18 +373,18 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "And when was that in the match? The first set? Second set?\n", - "When she broke you in the eighth game of the third set, she did a backhand off the net and it kind of clipped the net and you kind of netted the next one. Was that just a tough break?\n", - "You started 3Love down in the first set. You came back and won it 64. What was the turnaround for you in the opening set and on through the match?\n", - "Would you give her a good chance against Stosur in the next round?\n", - "Do you enjoy the balance of the life as a tour player and then back home in and the ability to serve your country in the military?\n" + "What was it like on court? When did you sense that she was vulnerable, beatable?\n", + "Did you advise Serena not to play there?\n", + "Were you aware of Richard and Venus coming in for the third set? You seemed to look to them after a couple points and showed real emotion.\n", + "Can you describe how disappointed you are right now.\n", + "In the beginning, you were almost down 4Love. Why the slow start?\n" ] } ], @@ -375,18 +395,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "And the second serve on the set point in the fourth set, just another day at the office?\n", - "Was it a big advantage to serve first in the third set?\n", - "But at the start of the third set again you had a little bit of a...\n", - "Speaking of the mental game, much is made of being the hunter or the hunted. For so long you were the hunted. This is the first week in a long time being the hunter. Is there a change at all in you?\n", - "How big of a deal was it get that break in the first game of the second set?\n" + "Why? What happened? What went wrong?\n", + "Is that your best result playing on grass? You had a good Wimbledon a couple years ago.\n", + "You seemed to have a good rhythm before the second rain break. What was the effect of the roof coming across? Did it feel different? Do you think it benefited him?\n", + "How is it different to play Roger on grass? You have played him on every surface. 
How is it different to play him here?\n", + "Why was he giving you so much trouble to start with? Was the problem with you or him or both?\n" ] } ], @@ -397,18 +417,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "No yoga, you prefer to dance? Some players do yoga.\n", - "What aspects of the match do you think were decisive, technically speaking?\n", - "Did you hear the birds? They were really crying. They were trapped and --\n", - "Did Sasha get an invitation to Kris Humphries' wedding this weekend?\n", - "Are you primarily based in Southern California or South Florida now?\n" + "The WTA has announces that next year your dad could come out with a tablet that shows stats. Do you think that your father would like having that access, and could you imagine what that would be like?\n", + "She's on an incredible run. Can you assess her as an upandcoming player? Is she somebody who can be an elite player in this game?\n", + "You've played Grand Slams before against players who were in their own country and been okay. Why the nerves? You've been in great form, haven't lost in a while.\n", + "When you look around the locker room you must feel like the senior citizen.\n", + "I forgot that you mentioned you're having your 27th birthday this fall.\n" ] } ], @@ -419,18 +439,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Are you planning to play tactically against James or Mathieu tomorrow?\n", - "Did you consider yourself a streaky player even in college?\n", - "You said you watched Scream last night to relax. Do you normally watch horror films to relax?\n", - "How do you view your secondround matchup with Bernard Tomic?\n", - "Just talk us through the messages on your kit bag.\n" + "Your friend Tiger Woods was here in Doral last weekend. Did you get a chance to cross paths with him or talk to him prior to this tournament?\n", + "You've started the year at the Australian Open the last couple years and done very well. Do you feel like this is a Major tournament to you that will make a big difference in your career here this week?\n", + "You'll obviously go home with happy memories, despite what's happened today.\n", + "Have you changed anything in your clay court preparations from last year?\n", + "First match out here morning after a Davis Cup stint and traveling, how did you feel? 
Talk about the match.\n" ] } ], @@ -450,39 +470,52 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "gender_models_surp = Surprise(model_key_selector=lambda utt: utt.meta['player_gender'], target_sample_size=10, context_sample_size=5000, surprise_attr_name='surprise_gender_model')" + "gender_models_surp = Surprise(model_key_selector=lambda utt: utt.meta['player_gender'],\n", + " target_sample_size=10, context_sample_size=5000,\n", + " surprise_attr_name='surprise_gender_model')" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "fit1: 81974it [00:00, 302952.81it/s]\n", - "fit2: 100%|██████████| 2/2 [00:12<00:00, 6.31s/it]\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3024b19405004768ba07bcd599b33e28", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "fit: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "489621b89e8b4e6e9766b690bd24f825", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "" + "fit: 0%| | 0/2 [00:00" + "surprise: 100%|##########| 1/1 [00:05<00:00, 5.21s/it]" ] }, - "execution_count": 24, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.20s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.02s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.13s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.16s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.52s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.04s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "surprise: 100%|##########| 1/1 [00:02<00:00, 2.14s/it]" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "gender_models_surp.transform(subset_corpus, obj_type='utterance', group_and_models=lambda utt: (utt.id, ['M', 'F']), group_model_attr_key=lambda _, m: m, 
selector=lambda utt: utt.meta['is_question'])" + "subset_corpus = \\\n", + " gender_models_surp.transform(subset_corpus, obj_type='utterance', \n", + " group_and_models=lambda utt: (utt.id, ['M', 'F']), \n", + " group_model_attr_key=lambda _, m: m,\n", + " selector=lambda utt: utt.meta['is_question'], \n", + " language_model=convokit_lm, eval_type='cross_entropy')" ] }, { @@ -531,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -540,82 +705,86 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.78670861966856" + "5.804742348431906" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'F']['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'F'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.7477053372750335" + "5.762531083154594" ] }, - "execution_count": 27, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'F']['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'F'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.784562889828235" + "5.7774629531902235" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'M']['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'M'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['M']).dropna().mean()" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5.81045743833415" + "5.820980867869622" ] }, - "execution_count": 29, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "utterances[utterances['meta.player_gender'] == 'M']['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" + "utterances[utterances['meta.player_gender'] == 'M'] \\\n", + " ['meta.surprise_gender_model'].map(lambda x: x['F']).dropna().mean()" ] }, { @@ -628,7 +797,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -642,7 +811,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/convokit/surprise/kenlm.py b/convokit/surprise/kenlm.py new file mode 100644 index 00000000..23e71cfc --- /dev/null +++ b/convokit/surprise/kenlm.py @@ -0,0 +1,246 @@ +import os +import subprocess +import time +import warnings +from pathlib import Path +from typing import Optional, Any, Union, List, Tuple + +import numpy as np + +from convokit.util import create_temp_files, delete_files +from .language_model import LanguageModel 
+ +try: + import kenlm +except (ModuleNotFoundError, ImportError): + raise ModuleNotFoundError( + "kenlm is not currently installed; run `pip install convokit[kenlm]` if you " + "would like to use the Kenlm language model. If kenlm installation fails, please " + "follow: https://github.com/kpu/kenlm/issues/57 to install kenlm." + ) + + +class Kenlm(LanguageModel): + """A language model to compute the deviation of target from context using KenLM. + + Using KenLM library, this language model implements cross-entropy and perplexity language model + evaluation functions, to be used in evaluating the average deviation of target text from the + specified context. + + Run `pip install convokit[kenlm]` to install the KenLM library before using this language model + class. If kenlm installation fails, please follow: https://github.com/kpu/kenlm/issues/57 to + install the KenLM library. + + :param model_type: The name of the :py:class:`~convokit.Kenlm`, defaults to "kenlm". Note that + the `model_type` can be accessed using the `type` property (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. This + language model currently uses the following keyword arguments: + + * `ngram_order`: The order of n-gram language model, when the specified `ngram_order` is + less than 2 (or unspecified), the `ngram_order` is set to 2, since the KenLM library does + not support n-gram order below 2 (see: https://github.com/kpu/kenlm/issues/171). + * `trained_model_filepath`: The filepath to a pre-trained language model that is to be + persistently used. + * `is_persistent`: Indicator of model persistence, i.e., the model generated in the first + pass or that loaded from `trained_model_filepath` is used in all evaluations. When the + `trained_model_filepath` is specified, persistence is implied. Defaults to `False`. + * `kenlm_path`: The path to the KenLM library, defaults to the user's home directory. + * `models_dir`: The folder path to store the (trained) binary KenLM models, defaults to + `None`, indicating that the trained KenLM models need not be stored. + * `model_filename`: The filename used in storing model artefacts, defaults to `model_type`. + * `n_jobs`: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. + + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). + """ + + def __init__(self, model_type: str = "kenlm", **kwargs: Optional[Any]): + super().__init__(model_type, **kwargs) + + self._ngram_order = kwargs["ngram_order"] if "ngram_order" in kwargs else 2 + if self._ngram_order < 2: + warnings.warn( + f"kenlm does not support n-gram order below 2; setting n-gram order to 2. " + f"See: https://github.com/kpu/kenlm/issues/171 for specifics." 
+ ) + self._ngram_order = 2 + + self._is_persistent = kwargs["is_persistent"] if "is_persistent" in kwargs else False + if self._is_persistent or "trained_model_filepath" in kwargs: + self._is_persistent = True + self.__kenlm_model = ( + Kenlm.load_kenlm_from_file(kwargs["trained_model_filepath"]) + if "trained_model_filepath" in kwargs + else None + ) + + if "kenlm_path" not in kwargs: + self._kenlm_path = os.path.join(str(Path.home()), "kenlm") + warnings.warn(f"the kenlm_path is unspecified, setting it to {self._kenlm_path}") + self.__kenlm_bin_path = os.path.join(self._kenlm_path, "build/bin") + if not os.path.isdir(self.__kenlm_bin_path): + raise FileNotFoundError( + f"the build directory for kenlm does not exist at: {self.__kenlm_bin_path}; " + f"build kenlm {self._kenlm_path} before computing surprise scores" + ) + + self._models_dir = kwargs["models_dir"] if "models_dir" in kwargs else None + if self._models_dir and not os.path.exists(self._models_dir): + warnings.warn(f"creating the folder: {self._models_dir} as it does not exist") + os.makedirs(self._models_dir) + self._model_filename = ( + kwargs["model_filename"] if "model_filename" in kwargs else self._model_type + ) + + @staticmethod + def load_kenlm_from_file(trained_model_filepath: str) -> kenlm.Model: + """Loads the pre-trained KenLM model from the specified filepath. + + :param trained_model_filepath: The path to the pre-trained KenLM model. + :return: The loaded KenLM model. + """ + kenlm_model = kenlm.Model(trained_model_filepath) + return kenlm_model + + def __make_files(self) -> Tuple[str, str, str]: + """Create (if needed) and return the filenames of intermittent files. + + KenLM language model needs the training data filename, .arpa filename, and the binary model + filename to generate a KenLM model. If the models are not stored (specified through the + argument `models_dir` in the constructor), `tempfile` files are used, else, all the files + are generated in the `models_dir/current_timestamp` folder, using the filename specified in + the constructor. + + :return: A tuple of filenames of all the intermittent files needed. + """ + if self._models_dir: + epoch = str(int(time.time())) + os.makedirs(os.path.join(self._models_dir, epoch)) + + train_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.txt") + arpa_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.arpa") + model_filename = os.path.join(self._models_dir, epoch, f"{self._model_filename}.bin") + else: + train_file, arpa_file, model_file = create_temp_files(num_files=3) + train_filename, arpa_filename, model_filename = ( + train_file.name, + arpa_file.name, + model_file.name, + ) + return train_filename, arpa_filename, model_filename + + @staticmethod + def __populate_train_file(filepath: str, samples: Union[List[List[str]], np.ndarray]): + """Writes the specified samples to a file, to be used in KenLM training. + + :param filepath: The filepath to write the samples to. + :param samples: The samples that are to be written to the file. Each list of samples is + delimited using a newline (`\n`). + """ + with open(filepath, "w", encoding="utf-8") as f: + for sample in samples: + f.write(f'{" ".join(sample).strip()}\n') + + def _get_kenlm_model(self, context_samples: Union[List[List[str]], np.ndarray]) -> kenlm.Model: + """Retrieve the KenLM model trained using the specified `context_samples`. 
+ + This method generates the training file using the `context_samples`, which is then used in + the generation of the .arpa and a binary KenLM trained model files. These intermittent files + are deleted, unless the specified value of `models_dir` is not `None`, indicating that the + models are to be stored. + + :param context_samples: The context samples to be used in training the KenLM model. + :return: The KenLM model trained on the specified `context_samples`. + """ + train_filename, arpa_filename, model_filename = self.__make_files() + + self.__populate_train_file(train_filename, samples=context_samples) + kenlm_args = [ + os.path.join(self.__kenlm_bin_path, "lmplz"), + "-o", + f"{self._ngram_order}", + "--text", + train_filename, + "--arpa", + arpa_filename, + "--discount_fallback", + ] + cmd_return = subprocess.run( + kenlm_args, + capture_output=False, + text=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + if cmd_return.returncode != 0: + delete_files([model_filename, arpa_filename, train_filename]) + raise RuntimeError("the kenlm model training was unsuccessful") + + kenlm_args = [ + os.path.join(self.__kenlm_bin_path, "build_binary"), + "trie", + arpa_filename, + model_filename, + ] + cmd_return = subprocess.run( + kenlm_args, + capture_output=False, + text=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + if cmd_return.returncode != 0: + delete_files([model_filename, arpa_filename, train_filename]) + raise RuntimeError("the kenlm model (binary) building was unsuccessful") + + kenlm_model = kenlm.Model(model_filename) + if not self._models_dir: + delete_files([model_filename, arpa_filename, train_filename]) + + return kenlm_model + + def cross_entropy( + self, + target: Union[List[str], np.ndarray], + context: Union[List[str], np.ndarray], + ) -> float: + """Implements the base class method to compute the cross-entropy. + + A KenLM model is trained using the specified `context`, and is used to evaluate the `target` + text. Note that, if model persistence is indicated in the constructor (using the argument + `is_persistent`), the model generated in the first pass or that loaded from the parameter + value of `trained_model_filepath` is used in all evaluations. (When `trained_model_filepath` + is specified, persistence is automatically implied.) + + The KenLM library returns a score of log-probabilities (when `score()` method is used), and + the cross-entropy is the negative log-likelihood. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q), used to train the model. + :return: The cross-entropy score computed using the `kenlm.score()` method. + """ + if self.__kenlm_model is None or not self._is_persistent: + self.__kenlm_model = self._get_kenlm_model([context]) + return -self.__kenlm_model.score(" ".join(target).strip()) + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + """Implements the base class method to compute perplexity. + + A KenLM model is trained using the specified `context`, and is used to evaluate the `target` + text. Note that, if model persistence is indicated in the constructor (using the argument + `is_persistent`), the model generated in the first pass or that loaded from the parameter + value of `trained_model_filepath` is used in all evaluations. (When `trained_model_filepath` + is specified, persistence is automatically implied.) 
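A minimal usage sketch of the persistence behavior described here (the `kenlm_path` below is hypothetical, and the sketch assumes the KenLM library has been built as described above):

```python
from convokit import Kenlm

# With is_persistent=True, the model trained on the first context is reused for
# every subsequent evaluation instead of retraining a model on each call.
kenlm_lm = Kenlm(kenlm_path='/home/user/kenlm', is_persistent=True, ngram_order=2)

context = "the quick brown fox jumps over the lazy dog".split()
target = "the quick red fox".split()
first = kenlm_lm.cross_entropy(target, context)      # trains a model from `context`
second = kenlm_lm.cross_entropy(target, ["unused"])  # reuses the persistent model; this context is ignored
```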
+ + The KenLM library returns a perplexity score, with the use of `kenlm.perplexity()` method. + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q), used to train the model. + :return: The perplexity score computed using the `kenlm.perplexity()` method. + """ + if self.__kenlm_model is None or not self._is_persistent: + self.__kenlm_model = self._get_kenlm_model([context]) + return self.__kenlm_model.perplexity(" ".join(target).strip()) diff --git a/convokit/surprise/language_model.py b/convokit/surprise/language_model.py new file mode 100644 index 00000000..d931cce4 --- /dev/null +++ b/convokit/surprise/language_model.py @@ -0,0 +1,167 @@ +from abc import ABC +from typing import Optional, Any, List, Dict, Union, final + +import numpy as np +from joblib import Parallel, delayed + + +class LanguageModel(ABC): + """The abstract base class for all language models. + + The language model base class defines the :py:meth:`~convokit.LanguageModel.evaluate` method, + which performs language model evaluation using the `eval_type` specified as an argument to the + :py:meth:`~convokit.LanguageModel.evaluate` method. Note that this method must be defined and + implemented in the subclass (e.g., if the `eval_type` is set to "cross_entropy", the subclass + must implement :py:meth:`~convokit.LanguageModel.cross_entropy` method). The implemented method + should take in a list of target tokens and a list of context tokens, and output the language + model evaluation score. + + Since most language models employs cross-entropy and perplexity evaluations, this base class + includes unimplemented designs of :py:meth:`~convokit.LanguageModel.cross_entropy` and + :py:meth:`~convokit.LanguageModel.perplexity` functions, which may be implemented (as needed) in + the subclasses. See the subclass implementations: :py:class:`~convokit.ConvoKitLanguageModel` + and :py:class:`~convokit.Kenlm` classes, which extend this base class. + + The :py:meth:`~convokit.LanguageModel.evaluate` method defined in this class is called on a set + of context samples and a set of target samples, and evaluates the target-context distribution + deviations using the `eval_type` language model evaluation function. + + Note: The subclasses cannot override the :py:meth:`~convokit.LanguageModel.evaluate` method. + + :param model_type: The name (identifier) of :py:class:`~convokit.LanguageModel`, defaults to + "language_model". Note that the `model_type` can be accessed using the `type` property + (e.g., `lm.type`). + :param kwargs: Any additional keyword arguments needed in the language model evaluations. For + instance, the cross-entropy computes might require smoothing parameter; hence, a `smooth` + parameter can be passed as an additional keyword argument. + Another keyword argument is `n_jobs`, used to specify the number of concurrent threads to be + used for routines that are parallelized with `joblib`, defaults to 1. + The language model configuration can be retrieved using the `config` property of the model + class object (e.g., `lm.config`). + """ + + def __init__(self, model_type: str = "language_model", **kwargs: Optional[Any]): + self._model_type = model_type + self._n_jobs = kwargs["n_jobs"] if "n_jobs" in kwargs else 1 + + self.__dict__.update((f"_{arg}", value) for arg, value in kwargs.items()) + + @property + def type(self) -> str: + """The `model_type` property of the language model. 
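To make the extension contract described in the class docstring concrete, a toy subclass might look as follows (a hypothetical class, not part of this diff; it assumes `LanguageModel` is importable from `convokit.surprise.language_model`):

```python
from convokit.surprise.language_model import LanguageModel

class UnigramOverlapLM(LanguageModel):
    """Toy model: scores a target by the fraction of its tokens unseen in the context."""

    def __init__(self, **kwargs):
        super().__init__(model_type="unigram_overlap_lm", **kwargs)

    def cross_entropy(self, target, context):
        context_vocab = set(context)
        return sum(token not in context_vocab for token in target) / len(target)

lm = UnigramOverlapLM(n_jobs=1)
# evaluate() dispatches to cross_entropy() because eval_type="cross_entropy".
print(lm.evaluate([["a", "b"]], [["a", "c"]], eval_type="cross_entropy"))  # 0.5
```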
+ + :return: The `model_type` specified in the class constructor, defaults to "language_model". + """ + return self._model_type + + @property + def config(self) -> Dict[str, Any]: + """The configuration (all the class parameters) of the language model. + + :return: The configuration (all the class parameters specified in the class constructor and + elsewhere) of the language model. + """ + private_var_prefix = f"_{self.__class__.__name__}" + return { + arg[1:]: value + for arg, value in self.__dict__.items() + if not arg.startswith(private_var_prefix) + } + + def _overwrite_args(self, args_to_overwrite: List[str], kwargs: Dict[str, Any]): + """Overwrites the class variables with the values specified in `kwargs`. + + :param args_to_overwrite: The list of arguments (class variable names) whose values are to + be overwritten using the values in the `kwargs`. + :param kwargs: The keyword arguments with updates to the values of the class variables. + """ + for arg in args_to_overwrite: + self.__dict__[f"_{arg}"] = kwargs[arg] if arg in kwargs else self.__dict__[f"_{arg}"] + + def cross_entropy( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + r"""An unimplemented base class method to compute the cross-entropy. + + The cross-entropy between a list of target tokens and a list of context tokens is to be + computed by the implementation in the subclass. Note that any variables to be used in this + method (e.g., smoothing value) must be accessed from the class scope. + + Calculates :math:`H(P, Q) = -\sum_{x \in X}P(x) \times \ln(Q(x))`. + + Note that we use the natural logarithm; however, any base and corresponding exponent can be + employed. For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :raises: Raises a `RuntimeError` if called without implementing it in the subclass. + """ + raise RuntimeError("cross entropy is not implemented") + + def perplexity( + self, target: Union[List[str], np.ndarray], context: Union[List[str], np.ndarray] + ) -> float: + r"""An unimplemented base class method to compute perplexity. + + The perplexity between a list of target tokens and a list of context tokens is to be + computed by the implementation in the subclass. Note that any variables to be used in this + method (e.g., smoothing value) must be accessed from the class scope. + + Calculates :math:`\text{PPL}(P, Q) = \exp(-\sum_{x \in X}P(x) \times \ln(Q(x)))`. + + Note that we use the natural logarithm; however, any base and corresponding exponent can be + employed. For instance, KenLM uses base-10 (see :py:class:`~convokit.Kenlm` for reference). + + :param target: A list of tokens that make up the target text (P). + :param context: A list of tokens that make up the context text (Q). + :raises: Raises a `RuntimeError` if called without implementing it in the subclass. + """ + raise RuntimeError("perplexity is not implemented") + + @final + def evaluate( + self, + target_samples: Union[List[List[str]], np.ndarray], + context_samples: Union[List[List[str]], np.ndarray], + eval_type: str = "cross_entropy", + **kwargs: Optional[Any], + ) -> np.ndarray: + """Computes the average deviation between target and context distributions. 
+ + For a given list of (fixed size) target sample lists and (fixed size) context sample lists, + the :py:meth:`~convokit.LanguageModel.evaluate` method computes the deviation between each + target and corresponding context pair, using `eval_type` language model evaluation metric. + Note that the subclass implementing this abstract base class must define and implement the + `eval_type` evaluation method. The final score output by this method is an average of all + the individual scores. + + Also note that, if specified as keyword arguments, any class variable values are overwritten + from within this method. + + :param target_samples: A list of target sample lists to be used to evaluate against the + corresponding context sample lists. + :param context_samples: A list of context sample lists that are to be used in evaluating the + corresponding target sample lists. + :param eval_type: The language model evaluation function (as `str`), used in evaluating the + language model trained using the context text, evaluated using the target text. Defaults + to "cross_entropy", i.e., calls the :py:meth:`~convokit.LanguageModel.cross_entropy` + method. + :param kwargs: Any additional keyword arguments needed in the language model evaluations. If + any class variables are passed using `kwargs`, the corresponding class variable values + are overwritten using the new values. + :return: The average score that measures the average deviation of target text from context. + """ + self._overwrite_args(list(kwargs.keys()), kwargs) + eval_fn = getattr(self, eval_type) + + if self._n_jobs == 1: + model_scores = [ + eval_fn(target_sample, context_sample) + for target_sample, context_sample in zip(target_samples, context_samples) + ] + else: + model_scores = Parallel(n_jobs=self._n_jobs, backend="threading")( + delayed(eval_fn)(target_sample, context_sample) + for target_sample, context_sample in zip(target_samples, context_samples) + ) + return np.nanmean(model_scores) diff --git a/convokit/surprise/surprise.py b/convokit/surprise/surprise.py index b03ce3de..73e187d1 100644 --- a/convokit/surprise/surprise.py +++ b/convokit/surprise/surprise.py @@ -1,272 +1,458 @@ -import numpy as np -from collections import defaultdict, Counter -from convokit import Transformer -from convokit.model import Corpus, CorpusComponent, Utterance +import warnings +from collections import defaultdict from itertools import chain +from typing import Callable, List, Tuple, Dict, Any, Optional, Union, Set + +import numpy as np +from IPython import get_ipython +from joblib import Parallel, delayed from nltk.tokenize import word_tokenize -from sklearn.feature_extraction.text import CountVectorizer from tqdm import tqdm -from typing import Callable, List, Tuple, Union - -def _cross_entropy(target: List[str], context: List[str], smooth=True): - """ - Calculates H(P,Q) = -sum_{x\in X}(P(x) * log(Q(x))) - - :param target: list of tokens that make up the target text (P) - :param context: list of tokens that make up the context (Q) - :param smooth: whether to use add 1 smoothing for OOV tokens +from convokit import Transformer +from convokit.model import Corpus, Utterance, CorpusComponent +from convokit.util import random_sampler +from .convokit_lm import ConvoKitLanguageModel - :return: cross entropy - """ - N_target, N_context = len(target), len(context) - if min(N_target, N_context) == 0: - return np.nan - context_counts = Counter(context) - V = len(context_counts) + 1 if smooth else 0 - k = 1 if smooth else 0 - val = 0 if smooth else 1 - return 
( - sum(-np.log((context_counts.get(tok, val) + k) / (N_context + V)) for tok in target) - / N_target - ) - - -def sample(tokens: List[Union[np.ndarray, List[str]]], sample_size: int, n_samples=50, p=None): - """ - Generates random samples from a list of lists of tokens. +try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell" or shell == "TerminalInteractiveShell": + from tqdm.notebook import tqdm +except (NameError, ModuleNotFoundError, ImportError): + pass - :param toks: a list of lists of tokens to sample from. - :param sample_size: the number of tokens to include in each sample. - :param n_samples: the number of samples to take. - :return: numpy array where each row is a sample of tokens - """ - if not sample_size: - assert len(tokens) == 1 - return np.tile(tokens[0], (n_samples, 1)) - tokens_list = np.array([toks for toks in tokens if len(toks) >= sample_size]) - if tokens_list.shape[0] == 0: - return None - rng = np.random.default_rng() - sample_idxes = rng.integers(0, tokens_list.shape[0], size=(n_samples)) - return np.array([rng.choice(tokens_list[i], sample_size) for i in sample_idxes]) +class Surprise(Transformer): + """Measures the amount of "surprise" between target and context utterance(s). + This transformer computes how surprising a target utterance or group of utterances is, when + compared to some context. The amount of "surprise" is measured by comparing the deviation + of the target distribution from the context distribution (e.g., cross-entropy, perplexity). + Furthermore, to mitigate the effects of text length on language model evaluation, the surprise + transformer uses several random fixed length samples from target and context text. -class Surprise(Transformer): - """ - Computes how surprising a target (an utterance or group of utterances) is based on some context. - The measure for surprise used is cross entropy. Uses fixed size samples from target and context text - to mitigate effects of length on cross entropy. - - :param model_key_selector: function that defines how utterances should be mapped to models. - Takes in an utterance and returns the key to use for mapping the utterance to a corresponding model. - :param tokenize: optional function that takes in a string and returns a list of tokens in that string. - default: nltk's word_tokenize - :param surprise_attr_name: the name for the metadata attribute to add to objects. - default: surprise - :param target_sample_size: number of tokens to sample from each target (test text). If `None`, then the entire target will be used. - :param context_sample_size: number of tokens to sample from each context (training text). If `None`, then the entire context will be used. - :param n_samples: number of samples to take for each target-context pair. - :param sampling_fn: function for generating samples of tokens. - :param smooth: whether to use laplace smoothing when calculating surprise. + :param model_key_selector: A function that specifies how utterances are to be mapped to models. + The function takes in an utterance and returns the key to use in mapping the utterance to a + corresponding model. + :param tokenizer: A function that returns a list of tokens in a given string, defaults to + `nltk.word_tokenize`. + :param surprise_attr_name: The name for the metadata attribute to add to the objects, defaults + to "surprise". + :param target_sample_size: The number of tokens to sample from each target (test text); when + specified as `None`, then the entire target will be used, defaults to 100. 
+ :param context_sample_size: The number of tokens to sample from each context (training text); + when specified as `None`, then the entire context will be used, defaults to 100. + :param n_samples: The number of samples to take for each target-context pair, defaults to 50. + :param sampling_fn: A function to generate samples of tokens, defaults to a random sampler. + :param n_jobs: The number of concurrent threads to be used for routines that are parallelized + with `joblib`, defaults to 1. """ def __init__( self, model_key_selector: Callable[[Utterance], str], tokenizer: Callable[[str], List[str]] = word_tokenize, - surprise_attr_name="surprise", - target_sample_size=100, - context_sample_size=100, - n_samples=50, - sampling_fn: Callable[[np.ndarray, int], np.ndarray] = sample, - smooth: bool = True, + surprise_attr_name: str = "surprise", + target_sample_size: int = 100, + context_sample_size: int = 100, + n_samples: int = 50, + sampling_fn: Callable[ + [List[Union[np.ndarray, List[str]]], int, int], np.ndarray + ] = random_sampler, + n_jobs: int = 1, ): - self.model_key_selector = model_key_selector - self.tokenizer = tokenizer - self.surprise_attr_name = surprise_attr_name - self.target_sample_size = target_sample_size - self.context_sample_size = context_sample_size - self.n_samples = n_samples - self.sampling_fn = sampling_fn - self.smooth = smooth + self._model_key_selector = model_key_selector + self._tokenizer = tokenizer + self._surprise_attr_name = surprise_attr_name + self._target_sample_size = target_sample_size + self._context_sample_size = context_sample_size + self._n_samples = n_samples + self._sampling_fn = sampling_fn + self._n_jobs = n_jobs + self._model_groups = None def fit( self, corpus: Corpus, text_func: Callable[[Utterance], List[str]] = None, selector: Callable[[Utterance], bool] = lambda utt: True, - ): - """ - Fits a model for each group of utterances in a corpus. The group that an - utterance belongs to is determined by the `model_key_selector` parameter in - the transformer's constructor. - - :param corpus: corpus to create models from. - :param text_func: optional function to define how the text a model is trained - on should be selected. Takes an utterance as input and returns a list of - strings to train the model corresponding to that utterance on. The model - corresponding to the utterance is determined by `self.model_key_selector`. - For every utterance corresponding to the same model key, this function - should return the same result. - If `text_func` is `None`, a model will be trained on the text from all - the utterances that belong to its group. - :param selector: determines which utterances in the corpus to train models for. + ) -> Transformer: + """Populate models for each group of utterances in a corpus. + + For each group of utterances in the corpus, a specific model is populated. The group that an + utterance belongs to is determined by the `model_key_selector` parameter in the constructor. + Furthermore, based on the `tokenizer` specified in the constructor, the text corresponding + to the model key is tokenized. + + :param corpus: The corpus to create models from. + :param text_func: The function used to define how the text a model is trained on should be + selected. Takes an utterance as input and returns a list of strings to train the model + corresponding to that utterance on. The model corresponding to the utterance is + determined by the `model_key_selector` parameter specified in the constructor. 
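For reference, the surprise demo notebook builds such a `text_func` by pooling each speaker's tokens from their other conversations and passes it to `fit()`:

```python
import itertools

# Train each speaker-conversation model on that speaker's text from *other* conversations.
text_func = lambda utt: [list(itertools.chain(*[u.meta['joined_tokens']
                                                for u in utt.speaker.iter_utterances()
                                                if u.conversation_id != utt.conversation_id]))]
surp = surp.fit(subset_corpus, text_func=text_func)
```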
For each + utterance corresponding to the same model key, this function should return the same + result. + Defaults to `None`; when the value is `None`, a model will be trained on the text from + all the utterances that belong to its group. + :param selector: A function to specify which utterances in the corpus to train models for. + Defaults to choosing all utterances, `lambda utt: True`. + :return: An instance of the Surprise transformer with the corresponding models populated. """ - self.model_groups = defaultdict(list) - for utt in tqdm(corpus.iter_utterances(selector=selector), desc="fit1"): - key = self.model_key_selector(utt) - if text_func: - if key not in self.model_groups: - self.model_groups[key] = text_func(utt) + self._model_groups = defaultdict(list) + + for utt in tqdm(corpus.iter_utterances(selector=selector), desc="fit"): + key = self._model_key_selector(utt) + if text_func is not None: + if key not in self._model_groups: + self._model_groups[key] = text_func(utt) else: - self.model_groups[key].append(utt.text) - for key in tqdm(self.model_groups, desc="fit2"): - if not text_func: - self.model_groups[key] = [" ".join(self.model_groups[key])] - self.model_groups[key] = list(map(lambda x: self.tokenizer(x), self.model_groups[key])) + self._model_groups[key].append(utt.text) + + for key in tqdm(self._model_groups, desc="fit"): + if text_func is None: + self._model_groups[key] = [" ".join(self._model_groups[key])] + # Using `map()` with `lambda` is (microscopically) costlier than a list comprehension. + # Reference: https://stackoverflow.com/a/1247490/6907625. + self._model_groups[key] = [ + self._tokenizer(utt_text) for utt_text in self._model_groups[key] + ] + return self - def transform( + def _compute_surprise( + self, + target: List[str], + context: List[List[str]], + lm_evaluation_fn: Callable[ + [Union[List[str], np.ndarray], Union[List[str], np.ndarray], Optional[Any]], + np.ndarray, + ], + **kwargs: Optional[Any], + ) -> np.ndarray: + """Compute the amount of "surprise" between target and context utterance(s). + + This method computes how surprising a target text is, when compared to some context. The + amount of "surprise" is measured by comparing the deviation of the target distribution from + the context distribution (e.g., cross-entropy, perplexity). Furthermore, to mitigate the + effects of text length on language model evaluation, several random samples of fixed sizes + are taken from the target and context. + + :param target: A list of tokens in the target. + :param context: A list of lists of tokens in each group of the context. + :param lm_evaluation_fn: The language model evaluation function. If using an instance of + :py:class:`~convokit.LanguageModel`, the :py:meth:`~convokit.LanguageModel.evaluate` + function is to be used here. To see examples of :py:class:`~convokit.LanguageModel`, + see: :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. This + function takes in a list of target samples and corresponding context samples, and + returns the amount of surprise using some underlying language model evaluation metric. + :param kwargs: Additional keyword arguments to be passed to the language model evaluation + function: + + * When using :py:class:`~convokit.LanguageModel`, the following keywords are relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". 
+ * The following arguments, if specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "language_model". + + * When using :py:class:`~convokit.ConvoKitLanguageModel`, the following keywords are + relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". + * The following arguments, if specified, overwrite the existing class values: + + * `smooth`: Indicator of using Laplace smoothing in the computation of surprise + scores, defaults to `True`. + + * The following arguments, inherited from :py:class:`~convokit.LanguageModel`, if + specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "convokit_lm". + + * When using :py:class:`~convokit.Kenlm`, the following keywords are relevant: + + * `eval_type`: The language model evaluation metric, defaults to "cross_entropy". + * The following arguments, if specified, overwrite the existing class values: + + * `ngram_order`: The order of n-gram language model. + * `trained_model_filepath`: The filepath to a pre-trained language model that is + to be persistently used. + * `is_persistent`: Indicator of model persistence, i.e., the model generated + in the first pass or that loaded from `trained_model_filepath` is used in all + evaluations. When `trained_model_filepath` is specified, persistence is + automatically implied. + * `kenlm_path`: The folder path to the folder of KenLM library. + * `models_dir`: The folder path to store the (trained) binary KenLM models. + * `model_filename`: The filename used in storing the KenLM model artefacts. + + * The following arguments, inherited from :py:class:`~convokit.LanguageModel`, if + specified, overwrite the existing class values: + + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "kenlm". + :return: The surprise score output by the language model evaluation function. + """ + target_tokens = np.array(target) + context_tokens = [np.array(text) for text in context] + target_samples = self._sampling_fn( + [target_tokens], self._target_sample_size, self._n_samples + ) + context_samples = self._sampling_fn( + context_tokens, self._context_sample_size, self._n_samples + ) + + if target_samples is None or context_samples is None: + return np.nan + return lm_evaluation_fn(target_samples, context_samples, **kwargs) + + def _transform( self, corpus: Corpus, obj_type: str, group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None, - group_model_attr_key: Callable[[str, str], str] = None, - selector: Callable[[CorpusComponent], bool] = lambda _: True, target_text_func: Callable[[Utterance], List[str]] = None, - ): - """ - Annotates `obj_type` components in a corpus with surprise scores. Should be - called after fit(). - - :param corpus: corpus to compute surprise for. - :param obj_type: the type of corpus components to annotate. Should be either - 'utterance', 'speaker', 'conversation', or 'corpus'. 
- :param group_and_models: optional function that defines how an utterance should - be grouped to form a target text and what models (contexts) the group should - be compared to when calculating surprise. Takes in an utterance and returns - a tuple containing the name of the group the utterance belongs to and a - list of models to calculate how surprising that group is against. Objects - will be annotated with a metadata field `self.surprise_attr_name` that is - maps a key corresponding to the `groupname` and `modelkey` to the surprise - score for utterances in the group when compared to the model. The key used - is defined by the `group_model_attr_key` parameter. - If `group_and_models` is `None`, `self.model_key_selector` will be used - to select the group that an utterance belongs to. The surprise score will - be calculated for each group of utterances compared to the model in + selector: Callable[[CorpusComponent], bool] = lambda _: True, + group_model_attr_key: Callable[[str, str], str] = None, + **kwargs: Optional[Any], + ) -> Corpus: + """Annotates `obj_type` components in a corpus with surprise scores. + + The transform function adds surprise score metadata to the `obj_type` components in the + given corpus. + + :param corpus: The corpus to compute surprise for. + :param obj_type: The type of corpus components to annotate. Should be one of "utterance", + "speaker", "conversation", or "corpus". + :param group_and_models: A function that defines how an utterance should be grouped to form + a target text and what models (contexts) the group should be compared to in calculating + surprise scores. Takes in an utterance and returns a tuple containing the name of the + group the utterance belongs to and a list of models to calculate how surprising that + group is against. Objects will be annotated with a metadata field `surprise_attr_name` + (specified in the constructor) that maps a key corresponding to the `group_name` and + `model_key` to the surprise score for the utterances in the group when compared to the + model. The key used is defined by the `group_model_attr_key` parameter. + Defaults to `None`; if `group_and_models` is `None`, `model_key_selector` specified in + the constructor will be used to select the group that an utterance belongs to. The + surprise score will be calculated for each group of utterances compared to the model in `self.models` corresponding to the group. - :param group_model_attr_key: optional function to define what key should be used - for a given `groupname` and `modelkey`. - If `group_model_attr_key` is `None`, the default key used will be - "GROUP_groupname_MODEL_modelkey" unless `groupname` and `modelkey` are equal - in which case just "modelkey" will be used as the key. - :param selector: function to select objects to annotate. if function returns true, object will be annotated. - :param target_text_func: optional function to define what the target text corresponding to an utterance should be. - takes in an utterance and returns a list of string tokens + :param target_text_func: A function to define what the target text corresponding to an + utterance should be; takes in an utterance and returns a list of string tokens. + Defaults to `None`. + :param selector: A function to specify which objects in the corpus to train models for, + defaults to choosing all `obj_type` objects, `lambda _: True`. + :param group_model_attr_key: A function that defines what key is to be used for a given + `group_name` and `model_key`, defaults to `None`. 
If `group_model_attr_key` is `None`, + the default key used will be "GROUP_group_name_MODEL_model_key" unless `group_name` and + `model_key` are equal, in which case just "model_key" will be used as the key. + :param kwargs: Additional keyword arguments to be passed for surprise computations (see + the documentation for :py:meth:`~Surprise._compute_surprise()` for these arguments), and + in creating the language model (if needed): + + * `language_model`: An instance of :py:class:`~convokit.LanguageModel` to be used in + computing the surprise scores, defaults to :py:class:`~convokit.ConvoKitLanguageModel` + and the arguments to the :py:class:`~convokit.ConvoKitLanguageModel` can be specified + here as: + + * `smooth`: Indicator of using Laplace smoothing in the computation of surprise + scores, defaults to `True`. + * `n_jobs`: The number of concurrent threads to be used for routines that are + parallelized with `joblib`, defaults to 1. + * `model_type`: The name of :py:class:`~convokit.LanguageModel`, defaults to + "convokit_lm". + :return: A modified version of the input corpus with the surprise scores. """ + + def _update_groups_models( + utt_: Utterance, + utt_groups_: Dict[str, List[List[str]]], + group_models_: Dict[str, Set[str]], + ): + """Updates the utterance groups and models based on `groups_and_models`. + + :param utt_: The utterance whose groups and models are to be populated (updated). + :param utt_groups_: Update utterance groups based on `groups_and_models` parameter. The + dictionary is modified in place. + :param group_models_: Update utterance models based on `groups_and_models` parameter. + The dictionary is modified in place. + """ + group_name, models = ( + group_and_models(utt_) + if group_and_models + else (self._model_key_selector(utt_), None) + ) + models = {group_name} if not models else models + if target_text_func: + if group_name not in utt_groups_: + utt_groups_[group_name] = [target_text_func(utt_)] + else: + utt_groups_[group_name].append(self._tokenizer(utt_.text)) + group_models_[group_name].update(models) + + def _format_attr_key( + group_name: str, model_key: str, format_fn: Callable[[str, str], str] = None + ) -> str: + """Formats the surprise score attribute key, given model name and key. + + :param group_name: The group name to be included in the surprise score attribute key. + :param model_key: The model key to be included in the surprise score attribute key. + :param format_fn: A function that takes in the `group_name` and `model_key` and outputs + the formatted attribute key, defaults to `None`. When `group_model_attr_key` is + `None`, the default key used will be "GROUP_group_name_MODEL_model_key" unless + `group_name` and `model_key` are equal, in which case just "model_key" will be used + as the key. + :return: The formatted surprise score attribute key. + """ + if format_fn: + return format_fn(group_name, model_key) + if group_name == model_key: + return model_key + return f"GROUP_{group_name}__MODEL_{model_key}" + + def __surprise_score_helper( + group_name: str, + utt_group: List[List[str]], + group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [ + Union[List[str], np.ndarray], + Union[List[str], np.ndarray], + Optional[Any], + ], + np.ndarray, + ], + ): + """A helper function to aid in the computation of surprise scores. + + :param group_name: The group name corresponding to the group model to be used. 
+ :param utt_group: The utterance group from those populated using `groups_and_models`. + :param group_models_: The group models that were populated using `groups_and_models`. + :param surprise_scores_: The surprise score (dictionary value) that is to be updated for + the corresponding utterance group and model. The dictionary is modified in place. + :param lm_evaluation_fn: The language model evaluation function. If using an instance + of :py:class:`~convokit.LanguageModel`, :py:meth:`~convokit.LanguageModel.evaluate` + function is to be used here. To see examples of :py:class:`~convokit.LanguageModel`, + see: :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. + The function takes in a list of target samples and corresponding context samples, + and returns the amount of surprise using some underlying model evaluation metric. + """ + for model_key in group_models_[group_name]: + assert model_key in self._model_groups, "invalid model key" + surprise_key = _format_attr_key(group_name, model_key, group_model_attr_key) + context = self._model_groups[model_key] + target = list(chain(*utt_group)) + surprise_scores_[surprise_key] = self._compute_surprise( + target, context, lm_evaluation_fn, **kwargs + ) + + def _update_surprise_scores( + utt_groups_: Dict[str, List[List[str]]], + group_models_: Dict[str, Set[str]], + surprise_scores_: Dict[str, np.ndarray], + lm_evaluation_fn: Callable[ + [ + Union[List[str], np.ndarray], + Union[List[str], np.ndarray], + Optional[Any], + ], + np.ndarray, + ], + ): + """Populate (update) the surprise score for utterance groups and models. + + :param utt_groups_: The utterance groups that were populated using `groups_and_models`. + :param group_models_: The group models that were populated using `groups_and_models`. + :param surprise_scores_: The surprise scores (dictionary values) that are to be updated + for the corresponding utterance groups and models. The surprise scores dictionary is + modified in place. + :param lm_evaluation_fn: The language model evaluation function. If using an instance + of :py:class:`~convokit.LanguageModel`, the `evaluate` function is to be used here. + To see the subclass implementations of :py:class:`~convokit.LanguageModel`, see: + :py:class:`~convokit.ConvoKitLanguageModel` and :py:class:`~convokit.Kenlm`. The + function takes in a list of target samples and corresponding context samples, and + returns the amount of surprise using some underlying model evaluation metric. + """ + if self._n_jobs == 1: + for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2): + __surprise_score_helper( + group_name, + utt_groups_[group_name], + group_models_, + surprise_scores_, + lm_evaluation_fn, + ) + else: + Parallel(n_jobs=self._n_jobs, backend="threading")( + delayed(__surprise_score_helper)( + group_name, + utt_groups_[group_name], + group_models_, + surprise_scores_, + lm_evaluation_fn, + ) + for group_name in tqdm(utt_groups_, leave=False, desc="surprise", delay=2) + ) + + if "n_jobs" in kwargs and kwargs["n_jobs"] != self._n_jobs: + warnings.warn( + f"specified n_jobs={kwargs['n_jobs']}; however, the surprise transformer was " + f"initialized with {self._n_jobs}, so defaulting to {self._n_jobs} jobs." 
+ ) + kwargs["n_jobs"] = self._n_jobs + language_model = ( + kwargs["language_model"] + if "language_model" in kwargs + else ConvoKitLanguageModel(**kwargs) + ) + if obj_type == "corpus": - utt_groups = defaultdict(list) - group_models = defaultdict(set) - for utt in corpus.iter_utterances(): - if group_and_models: - group_name, models = group_and_models(utt) - else: - group_name = self.model_key_selector(utt) - models = {group_name} - if target_text_func: - if group_name not in utt_groups: - utt_groups[group_name] = [target_text_func(utt)] - else: - utt_groups[group_name].append(self.tokenizer(utt.text)) - group_models[group_name].update(models) - surprise_scores = {} - for group_name in tqdm(utt_groups, desc="transform"): - for model_key in group_models[group_name]: - context = self.model_groups[model_key] - target = list(chain(*utt_groups[group_name])) - surprise_scores[ - Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - corpus.add_meta(self.surprise_attr_name, surprise_scores) + surprise_scores = defaultdict() + utt_groups, group_models = defaultdict(list), defaultdict(set) + for utt in tqdm(corpus.iter_utterances(), desc="transform"): + _update_groups_models(utt, utt_groups, group_models) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) + corpus.add_meta(self._surprise_attr_name, surprise_scores) elif obj_type == "utterance": for utt in tqdm(corpus.iter_utterances(selector=selector), desc="transform"): - if group_and_models: - group_name, models = group_and_models(utt) - surprise_scores = {} - for model_key in models: - context = self.model_groups[model_key] - target = ( - target_text_func(utt) if target_text_func else self.tokenizer(utt.text) - ) - surprise_scores[ - Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - utt.add_meta(self.surprise_attr_name, surprise_scores) - else: - group_name = self.model_key_selector(utt) - context = self.model_groups[group_name] - target = target_text_func(utt) if target_text_func else self.tokenizer(utt.text) - utt.add_meta(self.surprise_attr_name, self._compute_surprise(target, context)) + surprise_scores = defaultdict() + utt_groups, group_models = defaultdict(list), defaultdict(set) + _update_groups_models(utt, utt_groups, group_models) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) + utt.add_meta(self._surprise_attr_name, surprise_scores) else: for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc="transform"): - utt_groups = defaultdict(list) - group_models = defaultdict(set) + surprise_scores = defaultdict() + utt_groups, group_models = defaultdict(list), defaultdict(set) for utt in obj.iter_utterances(): - if group_and_models: - group_name, models = group_and_models(utt) - else: - group_name = self.model_key_selector(utt) - models = {group_name} - if target_text_func: - if group_name not in utt_groups: - utt_groups[group_name] = [target_text_func(utt)] - else: - utt_groups[group_name].append(self.tokenizer(utt.text)) - group_models[group_name].update(models) - surprise_scores = {} - for group_name in utt_groups: - for model_key in group_models[group_name]: - assert model_key in self.model_groups, "invalid model key" - if not self.model_groups[model_key]: - continue - context = self.model_groups[model_key] - target = list(chain(*utt_groups[group_name])) - surprise_scores[ - 
Surprise._format_attr_key(group_name, model_key, group_model_attr_key) - ] = self._compute_surprise(target, context) - obj.add_meta(self.surprise_attr_name, surprise_scores) + _update_groups_models(utt, utt_groups, group_models) + _update_surprise_scores( + utt_groups, group_models, surprise_scores, language_model.evaluate + ) + obj.add_meta(self._surprise_attr_name, surprise_scores) return corpus - def _compute_surprise(self, target: List[str], context: List[List[str]]): - """ - Computes how surprising a target text is based on a context. Surprise scores are calculated using cross entropy. - To mitigate length based effects on cross entropy, several random sample of fixed sizes are taken from the traget and context. - Returns the average of the cross entropies for all pairs of samples. + def transform(self, corpus: Corpus, **kwargs) -> Corpus: + """A wrapper over :py:meth:`~convokit.Surprise._transform` of the Surprise transformer. - :param target: a list of tokens in the target - :param context: a list of lists of tokens in each group of the context + Note: Since the transformer's :py:meth:`~convokit.Surprise.fit` method populates the model + groups, the :py:meth:`~convokit.Surprise.transform` function is to be called after calling + :py:meth:`~convokit.Surprise.fit`. - :return: surprise score + :param corpus: The corpus to transform. + :param kwargs: Any keyword arguments to be passed to :py:meth:`~convokit.Surprise.transform` + function of the Surprise transformer (e.g., `eval_type`). Refer to the documentation of + :py:meth:`~convokit.Surprise._transform()` for specific keyword arguments. + :return: A modified version of the input corpus with the surprise scores. """ - target_tokens = np.array(target) - context_tokens = [np.array(text) for text in context] - target_samples = self.sampling_fn([target_tokens], self.target_sample_size, self.n_samples) - context_samples = self.sampling_fn(context_tokens, self.context_sample_size, self.n_samples) - if target_samples is None or context_samples is None: - return np.nan - return np.nanmean( - [ - _cross_entropy(target_sample, context_sample, self.smooth) - for target_sample, context_sample in zip(target_samples, context_samples) - ] - ) - - @staticmethod - def _format_attr_key(group_name, model_key, format_fn=None): - if format_fn: - return format_fn(group_name, model_key) - if group_name == model_key: - return model_key - return f"GROUP_{group_name}__MODEL_{model_key}" + return self._transform(corpus=corpus, **kwargs) diff --git a/convokit/tests/surprise/__init__.py b/convokit/tests/surprise/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/convokit/tests/surprise/test_convokit_lm.py b/convokit/tests/surprise/test_convokit_lm.py new file mode 100644 index 00000000..7ee02180 --- /dev/null +++ b/convokit/tests/surprise/test_convokit_lm.py @@ -0,0 +1,75 @@ +import unittest + +from convokit import ConvoKitLanguageModel + + +class TestConvoKitLanguageModel(unittest.TestCase): + def _init(self, target_samples, context_samples): + self._target_samples = target_samples + self._context_samples = context_samples + + def test_cross_entropy_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=True) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="cross_entropy" + ) + self.assertEqual(round(float(score), 2), 1.38) + + def test_cross_entropy_no_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=False) + score = 
convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="cross_entropy" + ) + self.assertEqual(round(float(score), 2), 1.04) + + def test_perplexity_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=True) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="perplexity" + ) + self.assertEqual(round(float(score), 2), 4.02) + + def test_perplexity_no_smooth(self): + convokit_lm = ConvoKitLanguageModel(model_type="test_convokit_lm", smooth=False) + score = convokit_lm.evaluate( + self._target_samples, self._context_samples, eval_type="perplexity" + ) + self.assertEqual(round(float(score), 2), 3.00) + + +class TestWithMemory(TestConvoKitLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_cross_entropy_smooth(self): + super().test_cross_entropy_smooth() + + def test_cross_entropy_no_smooth(self): + super().test_cross_entropy_no_smooth() + + def test_perplexity_smooth(self): + super().test_perplexity_smooth() + + def test_perplexity_no_smooth(self): + super().test_perplexity_no_smooth() + + +class TestWithDb(TestConvoKitLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_cross_entropy_smooth(self): + super().test_cross_entropy_smooth() + + def test_cross_entropy_no_smooth(self): + super().test_cross_entropy_no_smooth() + + def test_perplexity_smooth(self): + super().test_perplexity_smooth() + + def test_perplexity_no_smooth(self): + super().test_perplexity_no_smooth() diff --git a/convokit/tests/surprise/test_language_model.py b/convokit/tests/surprise/test_language_model.py new file mode 100644 index 00000000..45161b49 --- /dev/null +++ b/convokit/tests/surprise/test_language_model.py @@ -0,0 +1,139 @@ +import unittest + +import nltk.lm as nltk_lm +from nltk.util import ngrams, everygrams + +from convokit.surprise import language_model + + +class TestLm(language_model.LanguageModel): + def __init__(self): + super().__init__("test_language_model") + + @staticmethod + def eval_func(target, context): + return abs(len(context) - len(target)) + + +class TestNltkLm(language_model.LanguageModel): + def __init__(self, ngram_order=2): + super().__init__("test_nltk_language_model") + self._ngram_order = ngram_order + + def eval_func(self, target, context): + kneser_ney_lm = nltk_lm.models.KneserNeyInterpolated( + order=self._ngram_order, vocabulary=nltk_lm.Vocabulary(target + context) + ) + kneser_ney_lm.fit([everygrams(context, max_len=self._ngram_order)]) + return kneser_ney_lm.entropy(ngrams(target, n=self._ngram_order)) + + +class TestLanguageModel(unittest.TestCase): + def _init(self, target_samples, context_samples): + self._target_samples = target_samples + self._context_samples = context_samples + + def test_model_type(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + self.assertEqual(test_lm.type, "test_language_model") + + def test_model_config(self): + test_lm = language_model.LanguageModel(model_type="test_language_model", smooth=True) + expected_config = {"model_type": "test_language_model", "n_jobs": 1, "smooth": True} + self.assertEqual(test_lm.config, 
expected_config) + + def test_overwrite_args(self): + test_lm = language_model.LanguageModel(model_type="test_language_model", smooth=True) + try: + test_lm.evaluate(self._target_samples, self._context_samples, smooth=False) + except RuntimeError: + pass + expected_config = {"model_type": "test_language_model", "n_jobs": 1, "smooth": False} + self.assertEqual(test_lm.config, expected_config) + + def test_evaluate_cross_entropy_runtime_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(RuntimeError): + test_lm.evaluate(self._target_samples, self._context_samples, "cross_entropy") + + def test_evaluate_perplexity_runtime_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(RuntimeError): + test_lm.evaluate(self._target_samples, self._context_samples, "perplexity") + + def test_evaluate_unimplemented_attribute_error(self): + test_lm = language_model.LanguageModel(model_type="test_language_model") + with self.assertRaises(AttributeError): + test_lm.evaluate(self._target_samples, self._context_samples, "unimplemented") + + def test_evaluate(self): + test_lm = TestLm() + score = test_lm.evaluate(self._target_samples, self._context_samples, "eval_func") + self.assertEqual(score, 0.5) + + def test_evaluate_nltk(self): + test_lm = TestNltkLm() + score = test_lm.evaluate(self._target_samples, self._context_samples, "eval_func") + self.assertEqual(round(float(score), 2), 1.25) + + +class TestWithMemory(TestLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_model_type(self): + super().test_model_type() + + def test_model_config(self): + super().test_model_config() + + def test_overwrite_args(self): + super().test_overwrite_args() + + def test_evaluate_cross_entropy_runtime_error(self): + super().test_evaluate_cross_entropy_runtime_error() + + def test_evaluate_perplexity_runtime_error(self): + super().test_evaluate_perplexity_runtime_error() + + def test_evaluate_unimplemented_attribute_error(self): + super().test_evaluate_unimplemented_attribute_error() + + def test_evaluate(self): + super().test_evaluate() + + def test_evaluate_nltk(self): + super().test_evaluate_nltk() + + +class TestWithDb(TestLanguageModel): + def setUp(self) -> None: + self._target_samples = [["this", "is", "test"], ["is", "test"]] + self._context_samples = [["this", "is", "a", "test"], ["this", "test"]] + super()._init(self._target_samples, self._context_samples) + + def test_model_type(self): + super().test_model_type() + + def test_model_config(self): + super().test_model_config() + + def test_overwrite_args(self): + super().test_overwrite_args() + + def test_evaluate_cross_entropy_runtime_error(self): + super().test_evaluate_cross_entropy_runtime_error() + + def test_evaluate_perplexity_runtime_error(self): + super().test_evaluate_perplexity_runtime_error() + + def test_evaluate_unimplemented_attribute_error(self): + super().test_evaluate_unimplemented_attribute_error() + + def test_evaluate(self): + super().test_evaluate() + + def test_evaluate_nltk(self): + super().test_evaluate_nltk() diff --git a/convokit/tests/surprise/test_surprise.py b/convokit/tests/surprise/test_surprise.py new file mode 100644 index 00000000..243f960d --- /dev/null +++ b/convokit/tests/surprise/test_surprise.py @@ -0,0 +1,191 
@@ +import random +import unittest + +import numpy as np + +from convokit.surprise import Surprise, ConvoKitLanguageModel +from convokit.tests.test_utils import small_burr_conv_corpus + + +class TestSurprise(unittest.TestCase): + def _init(self, corpus) -> None: + self._corpus = corpus + + def test_fit_model_groups(self): + surprise = Surprise( + model_key_selector=lambda utt: "_".join([utt.speaker.id, utt.conversation_id]) + ) + surprise = surprise.fit(self._corpus) + expected_model_groups = { + "hamilton_0": [["Pardon", "me", "."]], + "hamilton_1": [["Who", "'s", "asking", "?"]], + "hamilton_2": [["Are", "you", "Aaron", "Burr", ",", "sir", "?"]], + "burr_0": [["Are", "you", "Aaron", "Burr", ",", "sir", "?"]], + "burr_1": [["That", "depends", ".", "Pardon", "me", "."]], + "burr_2": [["That", "depends", "."]], + } + self.assertEqual(surprise._model_groups, expected_model_groups) + + def test_fit_model_groups_text_func_selector(self): + surprise = Surprise( + model_key_selector=lambda utt: "_".join([utt.speaker.id, utt.conversation_id]) + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join( + [ + speaker_utt.text + for speaker_utt in utt.speaker.iter_utterances() + if speaker_utt.conversation_id != utt.conversation_id + ] + ) + ], + selector=lambda utt: utt.conversation_id == "0", + ) + expected_model_groups = { + "hamilton_0": [ + ["Who", "'s", "asking", "?", "Are", "you", "Aaron", "Burr", ",", "sir", "?"] + ], + "burr_0": [["That", "depends", ".", "Pardon", "me", ".", "That", "depends", "."]], + } + self.assertEqual(surprise._model_groups, expected_model_groups) + + def test_transform_large_context_target_size(self): + surprise = Surprise(model_key_selector=lambda utt: "corpus") + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance") + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.array([score["corpus"] for score in utts]) + self.assertTrue(np.isnan(surprise_scores).all()) + + def test_transform_multiple_jobs(self): + surprise = Surprise(model_key_selector=lambda utt: "corpus", n_jobs=2) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance", n_jobs=2) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.array([score["corpus"] for score in utts]) + self.assertTrue(np.isnan(surprise_scores).all()) + + def test_transform_convokit_language_model(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + language_model = ConvoKitLanguageModel(smooth=False) + transformed_corpus = surprise.transform( + self._corpus, obj_type="utterance", language_model=language_model + ) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + def 
test_transform_language_model_parameters(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance", smooth=False) + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + def test_transform(self): + random.Random(42) + surprise = Surprise( + model_key_selector=lambda utt: "corpus", target_sample_size=3, context_sample_size=3 + ) + surprise = surprise.fit( + self._corpus, + text_func=lambda utt: [ + " ".join([corpus_utt.text for corpus_utt in self._corpus.iter_utterances()]) + ], + ) + transformed_corpus = surprise.transform(self._corpus, obj_type="utterance") + + utts = transformed_corpus.get_utterances_dataframe()["meta.surprise"] + surprise_scores = np.round(np.array([score["corpus"] for score in utts]), 1) + expected_scores = np.array([1.8, 1.7, 1.7, 1.8, 1.7, 1.8, 1.8]) + self.assertTrue(np.allclose(surprise_scores, expected_scores, atol=1e-01)) + + +class TestWithMemory(TestSurprise): + def setUp(self) -> None: + self._small_burr_corpus = small_burr_conv_corpus() + super()._init(self._small_burr_corpus) + + def test_fit_model_groups(self): + super().test_fit_model_groups() + + def test_fit_model_groups_text_func_selector(self): + super().test_fit_model_groups_text_func_selector() + + def test_transform_large_context_target_size(self): + super().test_transform_large_context_target_size() + + def test_transform_multiple_jobs(self): + super().test_transform_multiple_jobs() + + def test_transform_convokit_language_model(self): + super().test_transform_convokit_language_model() + + def test_transform_language_model_parameters(self): + super().test_transform_language_model_parameters() + + def test_transform(self): + super().test_transform() + + +class TestWithDb(TestSurprise): + def setUp(self) -> None: + self._small_burr_corpus = small_burr_conv_corpus() + super()._init(self._small_burr_corpus) + + def test_fit_model_groups(self): + super().test_fit_model_groups() + + def test_fit_model_groups_text_func_selector(self): + super().test_fit_model_groups_text_func_selector() + + def test_transform_large_context_target_size(self): + super().test_transform_large_context_target_size() + + def test_transform_multiple_jobs(self): + super().test_transform_multiple_jobs() + + def test_transform_convokit_language_model(self): + super().test_transform_convokit_language_model() + + def test_transform_language_model_parameters(self): + super().test_transform_language_model_parameters() + + def test_transform(self): + super().test_transform() diff --git a/convokit/tests/test_utils.py b/convokit/tests/test_utils.py index 42bd5759..d9a210ab 100644 --- a/convokit/tests/test_utils.py +++ b/convokit/tests/test_utils.py @@ -9,7 +9,10 @@ FOX_TEXT = "A quick brown fox jumps over the lazy dog." BUFFALO_TEXT = "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" -FOX_BUFFALO_TEXT = "A quick brown fox jumps over the lazy dog. 
Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" +FOX_BUFFALO_TEXT = ( + "A quick brown fox jumps over the lazy dog. Buffalo buffalo Buffalo buffalo " + "buffalo buffalo Buffalo buffalo" +) BURR_SIR_TEXT_1 = "Pardon me. Are you Aaron Burr, sir?" BURR_SIR_TEXT_2 = "That depends. Who's asking?" BURR_SIR_SENTENCE_1 = "Pardon me." @@ -129,6 +132,31 @@ def small_burr_corpus(): return Corpus(utterances=utterances) +def small_burr_conv_corpus(): + hamilton = Speaker(id="hamilton") + burr = Speaker(id="burr") + + utterances = [ + Utterance(id="0", text=BURR_SIR_SENTENCE_1, conversation_id="0", speaker=hamilton), + Utterance( + id="1", text=BURR_SIR_SENTENCE_2, conversation_id="0", reply_to="0", speaker=burr + ), + Utterance(id="2", text=BURR_SIR_SENTENCE_3, conversation_id="1", speaker=burr), + Utterance( + id="3", text=BURR_SIR_SENTENCE_4, conversation_id="1", reply_to="2", speaker=hamilton + ), + Utterance( + id="4", text=BURR_SIR_SENTENCE_1, conversation_id="1", reply_to="3", speaker=burr + ), + Utterance(id="5", text=BURR_SIR_SENTENCE_2, conversation_id="2", speaker=hamilton), + Utterance( + id="6", text=BURR_SIR_SENTENCE_3, conversation_id="2", reply_to="5", speaker=burr + ), + ] + + return Corpus(utterances=utterances) + + def small_burr_corpus_parsed(): corpus = small_burr_corpus() utterance_infos = [ diff --git a/convokit/util.py b/convokit/util.py index 6be3f65d..d9f99f7d 100644 --- a/convokit/util.py +++ b/convokit/util.py @@ -1,12 +1,15 @@ import json import os import shutil +import tempfile import urllib.request import uuid import warnings import zipfile -from typing import Dict +from pathlib import Path +from typing import Dict, Union, Optional, List, IO +import numpy as np import requests @@ -238,7 +241,6 @@ def download_local(name: str, data_dir: str): def _download_helper( dataset_path: str, url: str, verbose: bool, name: str, downloadeds_path: str ) -> None: - if ( url.lower().endswith(".corpus") or url.lower().endswith(".corpus.zip") @@ -254,7 +256,15 @@ def _download_helper( if length > 1e6 else str(round(length / 1e3, 1)) + "KB" ) - print("Downloading", name, "from", url, "(" + length + ")...", end=" ", flush=True) + print( + "Downloading", + name, + "from", + url, + "(" + length + ")...", + end=" ", + flush=True, + ) shutil.copyfileobj(response, out_file) # post-process (extract) corpora @@ -278,7 +288,9 @@ def _download_helper( ) # os.path.join(os.path.dirname(data), name) f.write( "{}$#${}$#${}\n".format( - name, os.path.realpath(os.path.dirname(dataset_path) + "/"), corpus_version(fn) + name, + os.path.realpath(os.path.dirname(dataset_path) + "/"), + corpus_version(fn), ) ) # f.write(name + "\n") @@ -292,7 +304,6 @@ def corpus_version(filename: str) -> int: # retrieve grouping and completes the download link for subreddit def get_subreddit_info(subreddit_name: str) -> str: - # base directory of subreddit corpuses subreddit_base = "http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/" data_dir = subreddit_base + "corpus-zipped/" @@ -335,13 +346,17 @@ def _get_wikiconv_year_info(year: str) -> str: def _get_supreme_info(year: str) -> str: - supreme_base = "http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/" return supreme_base + "supreme-" + year + ".zip" def meta_index(corpus=None, filename: str = None) -> Dict: - keys = ["utterances-index", "conversations-index", "speakers-index", "overall-index"] + keys = [ + "utterances-index", + "conversations-index", + "speakers-index", + "overall-index", + ] if corpus is not None: return 
{k: v for k, v in corpus.meta_index.items() if k in keys} if filename is not None: @@ -379,3 +394,55 @@ def deprecation(prev_name: str, new_name: str, stacklevel: int = 3): def create_safe_id(): return "_" + uuid.uuid4().hex + + +def random_sampler( + tokens: List[Union[np.ndarray, List[str]]], sample_size: int, n_samples: int +) -> Optional[np.ndarray]: + """Generates random samples from a list of lists of tokens. + + :param tokens: A list of lists of tokens to sample from. + :param sample_size: The number of tokens to include in each sample. + :param n_samples: The number of samples to take. + :return: A `numpy.array`, where each row is a sample of tokens. + """ + if not sample_size: + assert len(tokens) == 1 + return np.tile(tokens[0], (n_samples, 1)) + + tokens_list = np.array([tokens_ for tokens_ in tokens if len(tokens_) >= sample_size]) + if tokens_list.shape[0] == 0: + return None + + rng = np.random.default_rng() + sample_idxs = rng.integers(0, tokens_list.shape[0], size=n_samples) + return np.array([rng.choice(tokens_list[idx], sample_size) for idx in sample_idxs]) + + +def create_temp_files(num_files: int) -> List[IO]: + """Creates a specified number of `tempfile` files. + + :param num_files: The number of `tempfile` files to be created. + :return: A list of `tempfile.NamedTemporaryFile` files. + """ + tmp_files = [] + for _ in range(num_files): + tmp_files.append(tempfile.NamedTemporaryFile("w", delete=True)) + return tmp_files + + +def delete_files(tmp_filenames: List[str], remove_parent_dir: bool = True): + """Delete temporary files generated intermittently. + + :param tmp_filenames: The filenames of all the files to be deleted. + :param remove_parent_dir: Indicator of whether the parent directory is to be deleted, if it is + empty after deleting all the temporary files, defaults to True. + """ + tmp_filepaths = [Path(tmp_filename) for tmp_filename in tmp_filenames] + parent_dir = tmp_filepaths[0].parents[0] + + for tmp_filepath in tmp_filepaths: + Path.unlink(tmp_filepath, missing_ok=True) + + if remove_parent_dir and len(list(parent_dir.glob("*"))) == 0: + Path.rmdir(parent_dir) diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index f92d29cc..0a001512 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -16,4 +16,5 @@ These are the transformers related to generating some analysis of the Corpus. Pairer PairedPrediction Ranker + Surprise SpeakerConvoDiversity diff --git a/docs/source/language_model.rst b/docs/source/language_model.rst new file mode 100644 index 00000000..f3b56c13 --- /dev/null +++ b/docs/source/language_model.rst @@ -0,0 +1,21 @@ +Language model +============== + +Implements a language model and defines the `evaluate()` method, to perform +language model evaluation by comparing the deviation of the target distribution +from the context distribution (e.g., cross-entropy, perplexity). + +Base class +---------- + +.. automodule:: convokit.surprise.language_model + :members: + +Subclasses +---------- + +.. automodule:: convokit.surprise.convokit_lm + :members: + +.. automodule:: convokit.surprise.kenlm + :members: diff --git a/docs/source/surprise.rst b/docs/source/surprise.rst new file mode 100644 index 00000000..18bb23ad --- /dev/null +++ b/docs/source/surprise.rst @@ -0,0 +1,24 @@ +Surprise +======== + +Implements the measure of how "surprising" conversations are (e.g., across users +or within user conversations), thereby measuring users' language evolutions over +time. For reference, see the `tie-breaker paper +`_. 
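Before the linked demos below, a minimal sketch of the intended fit/transform flow (an editorial illustration, not part of the patch) may help orient readers. It assumes an already-loaded `corpus`; the per-speaker model grouping and the sample sizes are purely illustrative:

    from convokit.surprise import Surprise, ConvoKitLanguageModel

    # One model per speaker; any utterance-to-model-key mapping can be used instead.
    surprise = Surprise(
        model_key_selector=lambda utt: utt.speaker.id,
        target_sample_size=10,   # targets shorter than this are scored as NaN,
        context_sample_size=50,  # so keep sample sizes within your data's lengths
        n_samples=50,
    )
    surprise.fit(corpus)  # tokenizes and groups the training text per model key

    # Score each utterance against the model of its own speaker (the default when no
    # `group_and_models` is given); ConvoKitLanguageModel is also the default scorer
    # and is passed explicitly here only for clarity.
    corpus = surprise.transform(
        corpus,
        obj_type="utterance",
        language_model=ConvoKitLanguageModel(smooth=True),
    )
    # Each utterance now carries a "surprise" meta field: a dict mapping the model
    # key (here, the speaker id) to its score.
    print(next(corpus.iter_utterances()).meta["surprise"])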
+
+Example usage: `surprise demo
+`_,
+`tennis demo
+`_.
+
+.. automodule:: convokit.surprise.surprise
+    :members:
+    :private-members: _transform, _compute_surprise
+
+References
+----------
+
+.. toctree::
+    :maxdepth: 3
+
+    LanguageModel
diff --git a/setup.py b/setup.py
index c7b41e28..d6cfff81 100644
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@
     ],
     extras_require={
         "craft": ["torch>=0.12"],
+        "kenlm": ["kenlm>=0.0.0"],
     },
     classifiers=[
         "Programming Language :: Python",
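A closing note on scoring backends: the `kenlm` entry added to `extras_require` is optional, and the default `ConvoKitLanguageModel` needs no extra dependencies. The sketch below (again an illustration, not part of the patch) exercises it directly on the small token samples used in the new unit tests; the values quoted in the comment come from those tests:

    from convokit.surprise import ConvoKitLanguageModel

    target_samples = [["this", "is", "test"], ["is", "test"]]
    context_samples = [["this", "is", "a", "test"], ["this", "test"]]

    lm = ConvoKitLanguageModel(smooth=False)
    # Per the new tests, these evaluate to roughly 1.04 and 3.00, respectively.
    print(lm.evaluate(target_samples, context_samples, eval_type="cross_entropy"))
    print(lm.evaluate(target_samples, context_samples, eval_type="perplexity"))

To use KenLM instead (installable via the new extra, e.g. `pip install convokit[kenlm]`), pass a `Kenlm` instance as the `language_model` keyword to `Surprise.transform()` in exactly the same way.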