From a65098e5ca4d1c746554d530c62be5b51753dc72 Mon Sep 17 00:00:00 2001 From: Daniel Botros Date: Tue, 12 Dec 2023 12:43:18 -0500 Subject: [PATCH 1/4] Add PosNegIrony transformer and demo --- .../positive_negative_irony/posNegIrony.py | 178 ++++++++++ examples/pos-neg-irony/posNegIrony.ipynb | 321 ++++++++++++++++++ 2 files changed, 499 insertions(+) create mode 100644 convokit/positive_negative_irony/posNegIrony.py create mode 100644 examples/pos-neg-irony/posNegIrony.ipynb diff --git a/convokit/positive_negative_irony/posNegIrony.py b/convokit/positive_negative_irony/posNegIrony.py new file mode 100644 index 00000000..989aefcd --- /dev/null +++ b/convokit/positive_negative_irony/posNegIrony.py @@ -0,0 +1,178 @@ +import math +import nltk +from convokit.transformer import Transformer, Corpus +from inspect import signature +from nltk.sentiment import SentimentIntensityAnalyzer +nltk.download('vader_lexicon') + +class PosNegIronyTransformer(Transformer): + """ + A transformer to label all instances of the token "/s" (ironic utterances) + with a score indicating whether the irony is positive or negative, + based on the degree of sentiment of the utterance and its replies. + + :param obj_type: type of Corpus object to calculate: 'conversation', 'speaker', or 'utterance'; defaults to 'utterance' (currently only 'utterance' is supported) + :param input_field: input field from every utterance object. Will default to reading 'utt.text'. If a string is provided, then the metadata field with that name is read instead. + :param output_field: field for writing the computed output in metadata. Currently unused: results are written to the 'sentiment', 'replies_sentiment', and 'agree_score' utterance metadata fields. + :param input_filter: a boolean function of signature `input_filter(utterance, aux_input)`. Attributes will only be computed for utterances where `input_filter` returns `True`. By default, only utterances containing the '/s' marker and their replies are considered. + :param verbosity: frequency at which to print status messages when computing attributes.
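+ + fit() estimates the corpus-wide mean and standard deviation of VADER compound sentiment and collects the ids of '/s' utterances and their replies; transform() then writes 'sentiment', 'replies_sentiment' (the average compound sentiment of direct replies), and 'agree_score' to each '/s' utterance's metadata. + The accompanying demo notebook reads agree_score > 0 as positive irony, agree_score < 0 as negative irony, and 0 as neutral/unlabelled.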
+ """ + + def __init__( + self, + obj_type='utterance', + output_field=None, + input_field=None, + input_filter=filter, + verbosity=10000, + ): + if input_filter: + if len(signature(input_filter).parameters) == 1: + self.input_filter = lambda utt: input_filter(self, utt) + else: + self.input_filter = input_filter + else: + self.input_filter = lambda utt: True + self.obj_type = obj_type + self.input_field = input_field + self.output_field = output_field + self.verbosity = verbosity + self.sia = SentimentIntensityAnalyzer() + self.mean = 0 + self.sd = 0 + + def _print_output(self, i): + return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0) + + + def fit(self, corpus: Corpus) -> Corpus: + corpus_sent = {} + corpus_sent["pos"] = 0 + corpus_sent["neg"] = 0 + corpus_sent["neu"] = 0 + corpus_sent["compound"] = 0 + l = 0 + values = [] + + whitelist(self, corpus) + + if self.obj_type == 'utterance': + for idx, utterance in enumerate(corpus.iter_utterances()): + if self._print_output(idx): + print(f"%03d {self.obj_type} processed" % (idx)) + + if self.input_field is None: + text_entry = utterance.text + elif isinstance(self.input_field, str): + text_entry = utterance.meta(self.input_field) + if text_entry is None: + continue + + l += 1 + sentiment = self.sia.polarity_scores(text_entry) + corpus_sent["pos"] += sentiment["pos"] + corpus_sent["neg"] += sentiment["neg"] + corpus_sent["neu"] += sentiment["neu"] + corpus_sent["compound"] += sentiment["compound"] + values.append(sentiment["compound"]) + + corpus_sent = {key: value / l for key, value in corpus_sent.items()} + self.mean = corpus_sent["compound"] + + squared_differences = [(x - self.mean) ** 2 for x in values] + variance = sum(squared_differences) / (len(values) - 1) + standard_deviation = math.sqrt(variance) + self.sd = standard_deviation + + return self + + def transform(self, corpus: Corpus) -> Corpus: + """ + + :param corpus: Corpus + :return: the corpus + """ + + if self.obj_type == 'utterance': + total = len(list(corpus.iter_utterances())) + + for idx, utterance in enumerate(corpus.iter_utterances()): + if self._print_output(idx): + print(f"%03d/%03d {self.obj_type} processed" % (idx, total)) + + if not self.input_filter(self, utterance): + continue + + if self.input_field is None: + if ">" in utterance.text: + try: + text_entry = utterance.text.split("\n")[1] + except: + text_entry = utterance.text.split(".")[1] + else: + text_entry = utterance.text + if " /s " in text_entry: + text_entry = text_entry.split(" \s ")[0] + elif "\n/s" in text_entry: + text_entry = text_entry.split("\n/s")[0] + else: + text_entry = text_entry + elif isinstance(self.input_field, str): + text_entry = utterance.meta(self.input_field) + if text_entry is None: + continue + + if " /s " in utterance.text or "\n/s" in utterance.text: + sentiment = self.sia.polarity_scores(text_entry) + convo = utterance.get_conversation() + replies = list(convo.get_subtree(utterance.id).children) + acc_sent = 0 + average_sent = 0 + + if len(replies) > 0: + for reply in replies: + reply_sent = self.sia.polarity_scores(reply.utt.text) + acc_sent += reply_sent["compound"] + reply.utt.add_meta("sentiment", reply_sent) + average_sent = acc_sent / len(replies) + + utterance.add_meta("sentiment", sentiment) + utterance.add_meta("replies_sentiment", average_sent) + agree_score = 0 + + if average_sent == 0: + agree_score = 0 + elif (average_sent <= (self.mean - self.sd*.5) and average_sent >= (self.mean - self.sd*2) and sentiment["compound"] <= (self.mean - self.sd*.5)) or 
(average_sent >= (self.mean + self.sd*.5) and average_sent <= (self.mean - self.sd*2) and sentiment["compound"] >= (self.mean + self.sd*.5)) or (sentiment["compound"] <= (self.mean - self.sd*.5) and sentiment["compound"] >= (self.mean - self.sd*2) and average_sent <= (self.mean - self.sd*.5)) or (sentiment["compound"] >= (self.mean + self.sd*.5) and sentiment["compound"] <= (self.mean - self.sd*2) and average_sent >= (self.mean + self.sd*.5)): + agree_score = (average_sent + sentiment["compound"])/2 + elif (average_sent < (self.mean - self.sd*2) and sentiment["compound"] < (self.mean - self.sd*2)) or (average_sent > (self.mean + self.sd*2) and sentiment["compound"] > (self.mean + self.sd*2)): + agree_score = -abs((average_sent + sentiment["compound"])/2) + elif (average_sent > (self.mean + self.sd*.5) and sentiment["compound"] < (self.mean - self.sd*.5)) or (average_sent < (self.mean - self.sd*.5) and sentiment["compound"] > (self.mean + self.sd*.5)): + agree_score = (average_sent + -sentiment["compound"])/2 + else: + agree_score = 0 + + utterance.add_meta("agree_score", agree_score) + else: + raise KeyError('obj_type must be utterance') + + + if self.verbosity > 0: + print(f"%03d/%03d {self.obj_type} processed" % (total, total)) + return corpus + +def whitelist(self, corpus: Corpus): + whitelist = [] + for convo in corpus.iter_conversations(): + for utt in convo.iter_utterances(): + if " /s " in utt.text or "\n/s" in utt.text: + whitelist.append(utt.id) + convo = utt.get_conversation() + replies = list(convo.get_subtree(utt.id).bfs_traversal()) + for reply in replies: + if reply.utt.id != utt.id: + whitelist.append(reply.utt.id) + + self.whitelist = whitelist + +def filter(self, utt): + return utt.id in self.whitelist \ No newline at end of file diff --git a/examples/pos-neg-irony/posNegIrony.ipynb b/examples/pos-neg-irony/posNegIrony.ipynb new file mode 100644 index 00000000..0c6ddd43 --- /dev/null +++ b/examples/pos-neg-irony/posNegIrony.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'PosNegIronyTransformer' from 'convokit' (/Users/danielbotros/cs4300-env/lib/python3.7/site-packages/convokit/__init__.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Imports\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mconvokit\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPosNegIronyTransformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'PosNegIronyTransformer' from 'convokit' (/Users/danielbotros/cs4300-env/lib/python3.7/site-packages/convokit/__init__.py)" + ] + } + ], 
+ "source": [ + "# Imports\n", + "from convokit import Corpus, download, PosNegIronyTransformer\n", + "import math\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comparing Positive Irony In r/Ohio and r/Cleveland, PosNegIronyTransformer Example\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download Corpus', print summary statistics\n", + "cleveland = Corpus(filename=download(\"subreddit-Cleveland\"))\n", + "ohio = Corpus(filename=download(\"subreddit-Ohio\"))\n", + "\n", + "cleveland.print_summary_stats()\n", + "ohio.print_summary_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# See the number of ironic comments in each subreddit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "ohio_cnt = []\n", + "cleveland_cnt = []\n", + "\n", + "for utt in ohio.iter_utterances():\n", + " if \" /s \" or \"\\n/s\" in utt.text:\n", + " ohio_cnt.append(1)\n", + "\n", + "for utt in cleveland.iter_utterances():\n", + " if \" /s \" or \"\\n/s\" in utt.text:\n", + " cleveland_cnt.append(1)\n", + "\n", + "cleveland_cnt = sum(cleveland_cnt)\n", + "ohio_cnt = sum(ohio_cnt)\n", + "\n", + "print(cleveland_cnt)\n", + "\n", + "labels = ['r/Ohio', 'r/Cleveland']\n", + "\n", + "plt.bar(labels, [80, 87])\n", + "plt.xlabel('Subreddit')\n", + "plt.ylabel('Irony Counts')\n", + "plt.title('Number of Ironic Comments in each Subreddit')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fit and transform both Corpus'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformer = PosNegIronyTransformer(obj_type='utterance')\n", + "transformer.fit(cleveland)\n", + "transformer.transform(cleveland)\n", + "transformer.fit(ohio)\n", + "transformer.transform(ohio)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gather statistics about the average thread upvote score of positive ironic comments between r/Ohio and r/Cleveland" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "avg_ohio_score = 0\n", + "pos_irony_ohio_score = 0\n", + "\n", + "avg_cleveland_score = 0\n", + "pos_irony_cleveland_score = 0\n", + "\n", + "cleveland_size = 136087\n", + "ohio_size = 372075\n", + "\n", + "\n", + "\n", + "\n", + "avg_cleveland_length = 0\n", + "for convo in cleveland.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " avg_cleveland_length += 1\n", + " avg_cleveland_score += utt.meta[\"score\"]\n", + "\n", + "avg_cleveland_score = (avg_cleveland_score / avg_cleveland_length) / math.log(cleveland_size)\n", + "\n", + "pos_irony_cleveland_length = 0\n", + "for convo in cleveland.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] > 0: # Positive irony\n", + " convo = utt.get_conversation()\n", + " replies = list(convo.get_subtree(utt.id).bfs_traversal())\n", + " for reply in replies:\n", + " if reply.utt.id != utt.id:\n", + " pos_irony_cleveland_length += 1\n", + " pos_irony_cleveland_score += utt.meta[\"score\"]\n", + "\n", + "pos_irony_cleveland_score = (pos_irony_cleveland_score/pos_irony_cleveland_length)/ math.log(cleveland_size) ## Normalize for community 
size\n", + "pos_irony_cleveland_score_adj = pos_irony_cleveland_score - avg_cleveland_score\n", + "print(\"Number of comments for all ironic threads\" + str(pos_irony_cleveland_length))\n", + "\n", + "\n", + "\n", + "avg_ohio_length = 0\n", + "for convo in ohio.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " avg_ohio_length += 1\n", + " avg_ohio_score += utt.meta[\"score\"]\n", + "\n", + "avg_ohio_score = (avg_ohio_score / avg_ohio_length) / math.log(ohio_size)\n", + "\n", + "pos_irony_ohio_length = 0\n", + "for convo in ohio.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] > 0: # Positive irony\n", + " convo = utt.get_conversation()\n", + " replies = list(convo.get_subtree(utt.id).bfs_traversal())\n", + " for reply in replies:\n", + " if reply.utt.id != utt.id:\n", + " pos_irony_ohio_length += 1\n", + " pos_irony_ohio_score += utt.meta[\"score\"]\n", + "\n", + "pos_irony_ohio_score = (pos_irony_ohio_score/pos_irony_ohio_length)/ math.log(ohio_size) ## Normalize for community size\n", + "pos_irony_ohio_score_adj = pos_irony_ohio_score - avg_ohio_score\n", + "print(\"Number of comments for all ironic threads\" + str(pos_irony_ohio_length))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(pos_irony_cleveland_score)\n", + "print(pos_irony_cleveland_score_adj)\n", + "print()\n", + "print(pos_irony_ohio_score)\n", + "print(pos_irony_ohio_score_adj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare the average score of positive ironic comments in each subreddit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "community = ['r/Ohio', 'r/Cleveland']\n", + "values = [pos_irony_ohio_score,pos_irony_cleveland_score]\n", + "colors = ['blue', 'orange']\n", + "# Create a bar plot\n", + "plt.bar(community, values, color=colors)\n", + "\n", + "# Add labels and title\n", + "plt.xlabel('Community')\n", + "plt.ylabel('Average comment score of positive ironic comments')\n", + "\n", + "threshold1 = avg_ohio_score\n", + "threshold2 = avg_cleveland_score\n", + "plt.axhline(y=threshold1, color='blue', linestyle='--')\n", + "plt.axhline(y=threshold2, color='orange', linestyle='--')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare the average score of positive ironic comments in each subreddit, adjusted for the community mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "community = ['r/Ohio', 'r/Cleveland']\n", + "values = [pos_irony_ohio_score_adj,pos_irony_cleveland_score_adj]\n", + "colors = ['blue', 'orange']\n", + "# Create a bar plot\n", + "plt.bar(community, values, color=colors)\n", + "\n", + "# Add labels and title\n", + "plt.xlabel('Community')\n", + "plt.ylabel('Difference from community mean of positive irony')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Examine distribution of positive, negative, and neutral / unlabelled irony in both subreddits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pos = 0\n", + "neg = 0\n", + "neu = 0\n", + "\n", + "for utt in 
cleveland.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] < 0:\n", + " neg += 1\n", + " elif utt.meta[\"agree_score\"] > 0:\n", + " pos += 1\n", + " else:\n", + " neu += 1\n", + "\n", + "for utt in ohio.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] < 0:\n", + " neg += 1\n", + " elif utt.meta[\"agree_score\"] > 0:\n", + " pos += 1\n", + " else:\n", + " neu += 1\n", + "\n", + "labels = ['Neg', 'Neu', 'Pos']\n", + "plt.bar(labels, [neg, neu, pos])\n", + "plt.xlabel('Irony type')\n", + "plt.ylabel('Count')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cs4300-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 80518be34098cca19b07b128398ee80262112ea1 Mon Sep 17 00:00:00 2001 From: Daniel Botros Date: Tue, 12 Dec 2023 12:52:08 -0500 Subject: [PATCH 2/4] Minor quality changes / fixes --- convokit/positive_negative_irony/posNegIrony.py | 2 +- examples/pos-neg-irony/posNegIrony.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/convokit/positive_negative_irony/posNegIrony.py b/convokit/positive_negative_irony/posNegIrony.py index 989aefcd..0a500e4d 100644 --- a/convokit/positive_negative_irony/posNegIrony.py +++ b/convokit/positive_negative_irony/posNegIrony.py @@ -28,7 +28,7 @@ def __init__( ): if input_filter: if len(signature(input_filter).parameters) == 1: - self.input_filter = lambda utt: input_filter(self, utt) + self.input_filter = lambda utt: input_filter(utt) else: self.input_filter = input_filter else: diff --git a/examples/pos-neg-irony/posNegIrony.ipynb b/examples/pos-neg-irony/posNegIrony.ipynb index 0c6ddd43..792950b2 100644 --- a/examples/pos-neg-irony/posNegIrony.ipynb +++ b/examples/pos-neg-irony/posNegIrony.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -12,7 +12,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Imports\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mconvokit\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPosNegIronyTransformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 
Imports\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mconvokit\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPosNegIronyTransformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mImportError\u001b[0m: cannot import name 'PosNegIronyTransformer' from 'convokit' (/Users/danielbotros/cs4300-env/lib/python3.7/site-packages/convokit/__init__.py)" ] } From 3f14a44064851548b58a7f6ec3067dbae98a352e Mon Sep 17 00:00:00 2001 From: Daniel Botros Date: Tue, 12 Dec 2023 12:53:02 -0500 Subject: [PATCH 3/4] Clearing demo output --- examples/pos-neg-irony/posNegIrony.ipynb | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/examples/pos-neg-irony/posNegIrony.ipynb b/examples/pos-neg-irony/posNegIrony.ipynb index 792950b2..885315aa 100644 --- a/examples/pos-neg-irony/posNegIrony.ipynb +++ b/examples/pos-neg-irony/posNegIrony.ipynb @@ -2,21 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'PosNegIronyTransformer' from 'convokit' (/Users/danielbotros/cs4300-env/lib/python3.7/site-packages/convokit/__init__.py)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Imports\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mconvokit\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPosNegIronyTransformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'PosNegIronyTransformer' from 'convokit' (/Users/danielbotros/cs4300-env/lib/python3.7/site-packages/convokit/__init__.py)" - ] - } - ], + "outputs": [], "source": [ "# Imports\n", "from convokit import Corpus, download, PosNegIronyTransformer\n", From b15eab3ebabe21232ee52cc87ae78f5544903f0c Mon Sep 17 00:00:00 2001 From: Daniel Botros Date: Mon, 18 Dec 2023 16:13:04 -0500 Subject: [PATCH 4/4] Use black formatter --- .../positive_negative_irony/posNegIrony.py | 269 ++++++++++-------- 1 file changed, 152 insertions(+), 117 deletions(-) diff --git a/convokit/positive_negative_irony/posNegIrony.py b/convokit/positive_negative_irony/posNegIrony.py 
index 0a500e4d..1c9ea290 100644 --- a/convokit/positive_negative_irony/posNegIrony.py +++ b/convokit/positive_negative_irony/posNegIrony.py @@ -3,7 +3,9 @@ from convokit.transformer import Transformer, Corpus from inspect import signature from nltk.sentiment import SentimentIntensityAnalyzer -nltk.download('vader_lexicon') + +nltk.download("vader_lexicon") + class PosNegIronyTransformer(Transformer): """ @@ -20,7 +22,7 @@ class PosNegIronyTransformer(Transformer): def __init__( self, - obj_type='utterance', + obj_type="utterance", output_field=None, input_field=None, input_filter=filter, @@ -44,47 +46,46 @@ def __init__( def _print_output(self, i): return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0) - def fit(self, corpus: Corpus) -> Corpus: - corpus_sent = {} - corpus_sent["pos"] = 0 - corpus_sent["neg"] = 0 - corpus_sent["neu"] = 0 - corpus_sent["compound"] = 0 - l = 0 - values = [] - - whitelist(self, corpus) - - if self.obj_type == 'utterance': - for idx, utterance in enumerate(corpus.iter_utterances()): - if self._print_output(idx): - print(f"%03d {self.obj_type} processed" % (idx)) - - if self.input_field is None: - text_entry = utterance.text - elif isinstance(self.input_field, str): - text_entry = utterance.meta(self.input_field) - if text_entry is None: - continue - - l += 1 - sentiment = self.sia.polarity_scores(text_entry) - corpus_sent["pos"] += sentiment["pos"] - corpus_sent["neg"] += sentiment["neg"] - corpus_sent["neu"] += sentiment["neu"] - corpus_sent["compound"] += sentiment["compound"] - values.append(sentiment["compound"]) - - corpus_sent = {key: value / l for key, value in corpus_sent.items()} - self.mean = corpus_sent["compound"] - - squared_differences = [(x - self.mean) ** 2 for x in values] - variance = sum(squared_differences) / (len(values) - 1) - standard_deviation = math.sqrt(variance) - self.sd = standard_deviation - - return self + corpus_sent = {} + corpus_sent["pos"] = 0 + corpus_sent["neg"] = 0 + corpus_sent["neu"] = 0 + corpus_sent["compound"] = 0 + l = 0 + values = [] + + whitelist(self, corpus) + + if self.obj_type == "utterance": + for idx, utterance in enumerate(corpus.iter_utterances()): + if self._print_output(idx): + print(f"%03d {self.obj_type} processed" % (idx)) + + if self.input_field is None: + text_entry = utterance.text + elif isinstance(self.input_field, str): + text_entry = utterance.meta(self.input_field) + if text_entry is None: + continue + + l += 1 + sentiment = self.sia.polarity_scores(text_entry) + corpus_sent["pos"] += sentiment["pos"] + corpus_sent["neg"] += sentiment["neg"] + corpus_sent["neu"] += sentiment["neu"] + corpus_sent["compound"] += sentiment["compound"] + values.append(sentiment["compound"]) + + corpus_sent = {key: value / l for key, value in corpus_sent.items()} + self.mean = corpus_sent["compound"] + + squared_differences = [(x - self.mean) ** 2 for x in values] + variance = sum(squared_differences) / (len(values) - 1) + standard_deviation = math.sqrt(variance) + self.sd = standard_deviation + + return self def transform(self, corpus: Corpus) -> Corpus: """ @@ -93,86 +94,120 @@ def transform(self, corpus: Corpus) -> Corpus: :return: the corpus """ - if self.obj_type == 'utterance': - total = len(list(corpus.iter_utterances())) - - for idx, utterance in enumerate(corpus.iter_utterances()): - if self._print_output(idx): - print(f"%03d/%03d {self.obj_type} processed" % (idx, total)) - - if not self.input_filter(self, utterance): - continue - - if self.input_field is None: - if ">" in utterance.text: - 
try: - text_entry = utterance.text.split("\n")[1] - except: - text_entry = utterance.text.split(".")[1] - else: - text_entry = utterance.text - if " /s " in text_entry: - text_entry = text_entry.split(" \s ")[0] - elif "\n/s" in text_entry: - text_entry = text_entry.split("\n/s")[0] - else: - text_entry = text_entry - elif isinstance(self.input_field, str): - text_entry = utterance.meta(self.input_field) - if text_entry is None: - continue - - if " /s " in utterance.text or "\n/s" in utterance.text: - sentiment = self.sia.polarity_scores(text_entry) - convo = utterance.get_conversation() - replies = list(convo.get_subtree(utterance.id).children) - acc_sent = 0 - average_sent = 0 - - if len(replies) > 0: - for reply in replies: - reply_sent = self.sia.polarity_scores(reply.utt.text) - acc_sent += reply_sent["compound"] - reply.utt.add_meta("sentiment", reply_sent) - average_sent = acc_sent / len(replies) - - utterance.add_meta("sentiment", sentiment) - utterance.add_meta("replies_sentiment", average_sent) - agree_score = 0 - - if average_sent == 0: - agree_score = 0 - elif (average_sent <= (self.mean - self.sd*.5) and average_sent >= (self.mean - self.sd*2) and sentiment["compound"] <= (self.mean - self.sd*.5)) or (average_sent >= (self.mean + self.sd*.5) and average_sent <= (self.mean - self.sd*2) and sentiment["compound"] >= (self.mean + self.sd*.5)) or (sentiment["compound"] <= (self.mean - self.sd*.5) and sentiment["compound"] >= (self.mean - self.sd*2) and average_sent <= (self.mean - self.sd*.5)) or (sentiment["compound"] >= (self.mean + self.sd*.5) and sentiment["compound"] <= (self.mean - self.sd*2) and average_sent >= (self.mean + self.sd*.5)): - agree_score = (average_sent + sentiment["compound"])/2 - elif (average_sent < (self.mean - self.sd*2) and sentiment["compound"] < (self.mean - self.sd*2)) or (average_sent > (self.mean + self.sd*2) and sentiment["compound"] > (self.mean + self.sd*2)): - agree_score = -abs((average_sent + sentiment["compound"])/2) - elif (average_sent > (self.mean + self.sd*.5) and sentiment["compound"] < (self.mean - self.sd*.5)) or (average_sent < (self.mean - self.sd*.5) and sentiment["compound"] > (self.mean + self.sd*.5)): - agree_score = (average_sent + -sentiment["compound"])/2 - else: - agree_score = 0 - - utterance.add_meta("agree_score", agree_score) + if self.obj_type == "utterance": + total = len(list(corpus.iter_utterances())) + + for idx, utterance in enumerate(corpus.iter_utterances()): + if self._print_output(idx): + print(f"%03d/%03d {self.obj_type} processed" % (idx, total)) + + if not self.input_filter(self, utterance): + continue + + if self.input_field is None: + if ">" in utterance.text: + try: + text_entry = utterance.text.split("\n")[1] + except: + text_entry = utterance.text.split(".")[1] + else: + text_entry = utterance.text + if " /s " in text_entry: + text_entry = text_entry.split(" \s ")[0] + elif "\n/s" in text_entry: + text_entry = text_entry.split("\n/s")[0] + else: + text_entry = text_entry + elif isinstance(self.input_field, str): + text_entry = utterance.meta(self.input_field) + if text_entry is None: + continue + + if " /s " in utterance.text or "\n/s" in utterance.text: + sentiment = self.sia.polarity_scores(text_entry) + convo = utterance.get_conversation() + replies = list(convo.get_subtree(utterance.id).children) + acc_sent = 0 + average_sent = 0 + + if len(replies) > 0: + for reply in replies: + reply_sent = self.sia.polarity_scores(reply.utt.text) + acc_sent += reply_sent["compound"] + 
reply.utt.add_meta("sentiment", reply_sent) + average_sent = acc_sent / len(replies) + + utterance.add_meta("sentiment", sentiment) + utterance.add_meta("replies_sentiment", average_sent) + agree_score = 0 + + if average_sent == 0: + agree_score = 0 + elif ( + ( + average_sent <= (self.mean - self.sd * 0.5) + and average_sent >= (self.mean - self.sd * 2) + and sentiment["compound"] <= (self.mean - self.sd * 0.5) + ) + or ( + average_sent >= (self.mean + self.sd * 0.5) + and average_sent <= (self.mean - self.sd * 2) + and sentiment["compound"] >= (self.mean + self.sd * 0.5) + ) + or ( + sentiment["compound"] <= (self.mean - self.sd * 0.5) + and sentiment["compound"] >= (self.mean - self.sd * 2) + and average_sent <= (self.mean - self.sd * 0.5) + ) + or ( + sentiment["compound"] >= (self.mean + self.sd * 0.5) + and sentiment["compound"] <= (self.mean - self.sd * 2) + and average_sent >= (self.mean + self.sd * 0.5) + ) + ): + agree_score = (average_sent + sentiment["compound"]) / 2 + elif ( + average_sent < (self.mean - self.sd * 2) + and sentiment["compound"] < (self.mean - self.sd * 2) + ) or ( + average_sent > (self.mean + self.sd * 2) + and sentiment["compound"] > (self.mean + self.sd * 2) + ): + agree_score = -abs((average_sent + sentiment["compound"]) / 2) + elif ( + average_sent > (self.mean + self.sd * 0.5) + and sentiment["compound"] < (self.mean - self.sd * 0.5) + ) or ( + average_sent < (self.mean - self.sd * 0.5) + and sentiment["compound"] > (self.mean + self.sd * 0.5) + ): + agree_score = (average_sent + -sentiment["compound"]) / 2 + else: + agree_score = 0 + + utterance.add_meta("agree_score", agree_score) else: - raise KeyError('obj_type must be utterance') - + raise KeyError("obj_type must be utterance") if self.verbosity > 0: print(f"%03d/%03d {self.obj_type} processed" % (total, total)) return corpus - + + def whitelist(self, corpus: Corpus): - whitelist = [] - for convo in corpus.iter_conversations(): - for utt in convo.iter_utterances(): - if " /s " in utt.text or "\n/s" in utt.text: - whitelist.append(utt.id) - convo = utt.get_conversation() - replies = list(convo.get_subtree(utt.id).bfs_traversal()) - for reply in replies: - if reply.utt.id != utt.id: - whitelist.append(reply.utt.id) - - self.whitelist = whitelist + whitelist = [] + for convo in corpus.iter_conversations(): + for utt in convo.iter_utterances(): + if " /s " in utt.text or "\n/s" in utt.text: + whitelist.append(utt.id) + convo = utt.get_conversation() + replies = list(convo.get_subtree(utt.id).bfs_traversal()) + for reply in replies: + if reply.utt.id != utt.id: + whitelist.append(reply.utt.id) + + self.whitelist = whitelist + def filter(self, utt): - return utt.id in self.whitelist \ No newline at end of file + return utt.id in self.whitelist
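Usage sketch (not part of the patch): the snippet below shows how the transformer introduced in these commits might be applied, mirroring the demo notebook. It assumes the class is imported directly from its module path as added by this patch (the notebook's ImportError suggests it is not yet re-exported from convokit's top-level __init__), and "subreddit-Cleveland" is simply the corpus used in the demo; any threaded ConvoKit corpus should work.

from convokit import Corpus, download
# Direct module import, assuming the package layout added by this patch; the
# top-level `from convokit import PosNegIronyTransformer` fails until the class
# is registered in convokit/__init__.py.
from convokit.positive_negative_irony.posNegIrony import PosNegIronyTransformer

# Download and load the demo corpus.
corpus = Corpus(filename=download("subreddit-Cleveland"))

transformer = PosNegIronyTransformer(obj_type="utterance")
transformer.fit(corpus)        # corpus-wide mean/std of VADER compound scores + '/s' whitelist
transformer.transform(corpus)  # annotates '/s' utterances and their replies

# Each '/s' utterance now carries its own sentiment, the average sentiment of its
# direct replies, and an agree_score; the demo notebook reads agree_score > 0 as
# positive irony, agree_score < 0 as negative irony, and 0 as neutral/unlabelled.
for utt in corpus.iter_utterances():
    if " /s " in utt.text or "\n/s" in utt.text:
        print(utt.id, utt.meta["agree_score"], utt.meta["replies_sentiment"])

Note that fit() must run before transform(): transform() relies on the mean, standard deviation, and '/s' whitelist computed during fitting.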