diff --git a/convokit/positive_negative_irony/posNegIrony.py b/convokit/positive_negative_irony/posNegIrony.py new file mode 100644 index 00000000..1c9ea290 --- /dev/null +++ b/convokit/positive_negative_irony/posNegIrony.py @@ -0,0 +1,213 @@ +import math +import nltk +from convokit.transformer import Transformer, Corpus +from inspect import signature +from nltk.sentiment import SentimentIntensityAnalyzer + +nltk.download("vader_lexicon") + + +class PosNegIronyTransformer(Transformer): + """ + A transformer to label all instances of the token "/s" (ironic utterances) + with a score corresponding to whether or not it is positive or negative irony, + based the degree of sentiment of the utterance and its replies. + + :param obj_type: type of Corpus object to calculate: 'conversation', 'speaker', or 'utterance', default to be 'utterance' + :param input_field: Input fields from every utterance object. Will default to reading 'utt.text'. If a string is provided, than consider metadata with field name input_field. + :param output_field: field for writing the computed output in metadata. Will default to write to utterance metadata with name 'capitalization'. + :param input_filter: a boolean function of signature `input_filter(utterance, aux_input)`. attributes will only be computed for utterances where `input_filter` returns `True`. By default, will always return `True`, meaning that attributes will be computed for all utterances. + :param verbosity: frequency at which to print status messages when computing attributes. + """ + + def __init__( + self, + obj_type="utterance", + output_field=None, + input_field=None, + input_filter=filter, + verbosity=10000, + ): + if input_filter: + if len(signature(input_filter).parameters) == 1: + self.input_filter = lambda utt: input_filter(utt) + else: + self.input_filter = input_filter + else: + self.input_filter = lambda utt: True + self.obj_type = obj_type + self.input_field = input_field + self.output_field = output_field + self.verbosity = verbosity + self.sia = SentimentIntensityAnalyzer() + self.mean = 0 + self.sd = 0 + + def _print_output(self, i): + return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0) + + def fit(self, corpus: Corpus) -> Corpus: + corpus_sent = {} + corpus_sent["pos"] = 0 + corpus_sent["neg"] = 0 + corpus_sent["neu"] = 0 + corpus_sent["compound"] = 0 + l = 0 + values = [] + + whitelist(self, corpus) + + if self.obj_type == "utterance": + for idx, utterance in enumerate(corpus.iter_utterances()): + if self._print_output(idx): + print(f"%03d {self.obj_type} processed" % (idx)) + + if self.input_field is None: + text_entry = utterance.text + elif isinstance(self.input_field, str): + text_entry = utterance.meta(self.input_field) + if text_entry is None: + continue + + l += 1 + sentiment = self.sia.polarity_scores(text_entry) + corpus_sent["pos"] += sentiment["pos"] + corpus_sent["neg"] += sentiment["neg"] + corpus_sent["neu"] += sentiment["neu"] + corpus_sent["compound"] += sentiment["compound"] + values.append(sentiment["compound"]) + + corpus_sent = {key: value / l for key, value in corpus_sent.items()} + self.mean = corpus_sent["compound"] + + squared_differences = [(x - self.mean) ** 2 for x in values] + variance = sum(squared_differences) / (len(values) - 1) + standard_deviation = math.sqrt(variance) + self.sd = standard_deviation + + return self + + def transform(self, corpus: Corpus) -> Corpus: + """ + + :param corpus: Corpus + :return: the corpus + """ + + if self.obj_type == "utterance": + total = len(list(corpus.iter_utterances())) + + for idx, utterance in enumerate(corpus.iter_utterances()): + if self._print_output(idx): + print(f"%03d/%03d {self.obj_type} processed" % (idx, total)) + + if not self.input_filter(self, utterance): + continue + + if self.input_field is None: + if ">" in utterance.text: + try: + text_entry = utterance.text.split("\n")[1] + except: + text_entry = utterance.text.split(".")[1] + else: + text_entry = utterance.text + if " /s " in text_entry: + text_entry = text_entry.split(" \s ")[0] + elif "\n/s" in text_entry: + text_entry = text_entry.split("\n/s")[0] + else: + text_entry = text_entry + elif isinstance(self.input_field, str): + text_entry = utterance.meta(self.input_field) + if text_entry is None: + continue + + if " /s " in utterance.text or "\n/s" in utterance.text: + sentiment = self.sia.polarity_scores(text_entry) + convo = utterance.get_conversation() + replies = list(convo.get_subtree(utterance.id).children) + acc_sent = 0 + average_sent = 0 + + if len(replies) > 0: + for reply in replies: + reply_sent = self.sia.polarity_scores(reply.utt.text) + acc_sent += reply_sent["compound"] + reply.utt.add_meta("sentiment", reply_sent) + average_sent = acc_sent / len(replies) + + utterance.add_meta("sentiment", sentiment) + utterance.add_meta("replies_sentiment", average_sent) + agree_score = 0 + + if average_sent == 0: + agree_score = 0 + elif ( + ( + average_sent <= (self.mean - self.sd * 0.5) + and average_sent >= (self.mean - self.sd * 2) + and sentiment["compound"] <= (self.mean - self.sd * 0.5) + ) + or ( + average_sent >= (self.mean + self.sd * 0.5) + and average_sent <= (self.mean - self.sd * 2) + and sentiment["compound"] >= (self.mean + self.sd * 0.5) + ) + or ( + sentiment["compound"] <= (self.mean - self.sd * 0.5) + and sentiment["compound"] >= (self.mean - self.sd * 2) + and average_sent <= (self.mean - self.sd * 0.5) + ) + or ( + sentiment["compound"] >= (self.mean + self.sd * 0.5) + and sentiment["compound"] <= (self.mean - self.sd * 2) + and average_sent >= (self.mean + self.sd * 0.5) + ) + ): + agree_score = (average_sent + sentiment["compound"]) / 2 + elif ( + average_sent < (self.mean - self.sd * 2) + and sentiment["compound"] < (self.mean - self.sd * 2) + ) or ( + average_sent > (self.mean + self.sd * 2) + and sentiment["compound"] > (self.mean + self.sd * 2) + ): + agree_score = -abs((average_sent + sentiment["compound"]) / 2) + elif ( + average_sent > (self.mean + self.sd * 0.5) + and sentiment["compound"] < (self.mean - self.sd * 0.5) + ) or ( + average_sent < (self.mean - self.sd * 0.5) + and sentiment["compound"] > (self.mean + self.sd * 0.5) + ): + agree_score = (average_sent + -sentiment["compound"]) / 2 + else: + agree_score = 0 + + utterance.add_meta("agree_score", agree_score) + else: + raise KeyError("obj_type must be utterance") + + if self.verbosity > 0: + print(f"%03d/%03d {self.obj_type} processed" % (total, total)) + return corpus + + +def whitelist(self, corpus: Corpus): + whitelist = [] + for convo in corpus.iter_conversations(): + for utt in convo.iter_utterances(): + if " /s " in utt.text or "\n/s" in utt.text: + whitelist.append(utt.id) + convo = utt.get_conversation() + replies = list(convo.get_subtree(utt.id).bfs_traversal()) + for reply in replies: + if reply.utt.id != utt.id: + whitelist.append(reply.utt.id) + + self.whitelist = whitelist + + +def filter(self, utt): + return utt.id in self.whitelist diff --git a/examples/pos-neg-irony/posNegIrony.ipynb b/examples/pos-neg-irony/posNegIrony.ipynb new file mode 100644 index 00000000..885315aa --- /dev/null +++ b/examples/pos-neg-irony/posNegIrony.ipynb @@ -0,0 +1,309 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "from convokit import Corpus, download, PosNegIronyTransformer\n", + "import math\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comparing Positive Irony In r/Ohio and r/Cleveland, PosNegIronyTransformer Example\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download Corpus', print summary statistics\n", + "cleveland = Corpus(filename=download(\"subreddit-Cleveland\"))\n", + "ohio = Corpus(filename=download(\"subreddit-Ohio\"))\n", + "\n", + "cleveland.print_summary_stats()\n", + "ohio.print_summary_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# See the number of ironic comments in each subreddit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "ohio_cnt = []\n", + "cleveland_cnt = []\n", + "\n", + "for utt in ohio.iter_utterances():\n", + " if \" /s \" or \"\\n/s\" in utt.text:\n", + " ohio_cnt.append(1)\n", + "\n", + "for utt in cleveland.iter_utterances():\n", + " if \" /s \" or \"\\n/s\" in utt.text:\n", + " cleveland_cnt.append(1)\n", + "\n", + "cleveland_cnt = sum(cleveland_cnt)\n", + "ohio_cnt = sum(ohio_cnt)\n", + "\n", + "print(cleveland_cnt)\n", + "\n", + "labels = ['r/Ohio', 'r/Cleveland']\n", + "\n", + "plt.bar(labels, [80, 87])\n", + "plt.xlabel('Subreddit')\n", + "plt.ylabel('Irony Counts')\n", + "plt.title('Number of Ironic Comments in each Subreddit')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fit and transform both Corpus'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformer = PosNegIronyTransformer(obj_type='utterance')\n", + "transformer.fit(cleveland)\n", + "transformer.transform(cleveland)\n", + "transformer.fit(ohio)\n", + "transformer.transform(ohio)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gather statistics about the average thread upvote score of positive ironic comments between r/Ohio and r/Cleveland" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "avg_ohio_score = 0\n", + "pos_irony_ohio_score = 0\n", + "\n", + "avg_cleveland_score = 0\n", + "pos_irony_cleveland_score = 0\n", + "\n", + "cleveland_size = 136087\n", + "ohio_size = 372075\n", + "\n", + "\n", + "\n", + "\n", + "avg_cleveland_length = 0\n", + "for convo in cleveland.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " avg_cleveland_length += 1\n", + " avg_cleveland_score += utt.meta[\"score\"]\n", + "\n", + "avg_cleveland_score = (avg_cleveland_score / avg_cleveland_length) / math.log(cleveland_size)\n", + "\n", + "pos_irony_cleveland_length = 0\n", + "for convo in cleveland.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] > 0: # Positive irony\n", + " convo = utt.get_conversation()\n", + " replies = list(convo.get_subtree(utt.id).bfs_traversal())\n", + " for reply in replies:\n", + " if reply.utt.id != utt.id:\n", + " pos_irony_cleveland_length += 1\n", + " pos_irony_cleveland_score += utt.meta[\"score\"]\n", + "\n", + "pos_irony_cleveland_score = (pos_irony_cleveland_score/pos_irony_cleveland_length)/ math.log(cleveland_size) ## Normalize for community size\n", + "pos_irony_cleveland_score_adj = pos_irony_cleveland_score - avg_cleveland_score\n", + "print(\"Number of comments for all ironic threads\" + str(pos_irony_cleveland_length))\n", + "\n", + "\n", + "\n", + "avg_ohio_length = 0\n", + "for convo in ohio.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " avg_ohio_length += 1\n", + " avg_ohio_score += utt.meta[\"score\"]\n", + "\n", + "avg_ohio_score = (avg_ohio_score / avg_ohio_length) / math.log(ohio_size)\n", + "\n", + "pos_irony_ohio_length = 0\n", + "for convo in ohio.iter_conversations():\n", + " for utt in convo.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] > 0: # Positive irony\n", + " convo = utt.get_conversation()\n", + " replies = list(convo.get_subtree(utt.id).bfs_traversal())\n", + " for reply in replies:\n", + " if reply.utt.id != utt.id:\n", + " pos_irony_ohio_length += 1\n", + " pos_irony_ohio_score += utt.meta[\"score\"]\n", + "\n", + "pos_irony_ohio_score = (pos_irony_ohio_score/pos_irony_ohio_length)/ math.log(ohio_size) ## Normalize for community size\n", + "pos_irony_ohio_score_adj = pos_irony_ohio_score - avg_ohio_score\n", + "print(\"Number of comments for all ironic threads\" + str(pos_irony_ohio_length))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(pos_irony_cleveland_score)\n", + "print(pos_irony_cleveland_score_adj)\n", + "print()\n", + "print(pos_irony_ohio_score)\n", + "print(pos_irony_ohio_score_adj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare the average score of positive ironic comments in each subreddit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "community = ['r/Ohio', 'r/Cleveland']\n", + "values = [pos_irony_ohio_score,pos_irony_cleveland_score]\n", + "colors = ['blue', 'orange']\n", + "# Create a bar plot\n", + "plt.bar(community, values, color=colors)\n", + "\n", + "# Add labels and title\n", + "plt.xlabel('Community')\n", + "plt.ylabel('Average comment score of positive ironic comments')\n", + "\n", + "threshold1 = avg_ohio_score\n", + "threshold2 = avg_cleveland_score\n", + "plt.axhline(y=threshold1, color='blue', linestyle='--')\n", + "plt.axhline(y=threshold2, color='orange', linestyle='--')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare the average score of positive ironic comments in each subreddit, adjusted for the community mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "community = ['r/Ohio', 'r/Cleveland']\n", + "values = [pos_irony_ohio_score_adj,pos_irony_cleveland_score_adj]\n", + "colors = ['blue', 'orange']\n", + "# Create a bar plot\n", + "plt.bar(community, values, color=colors)\n", + "\n", + "# Add labels and title\n", + "plt.xlabel('Community')\n", + "plt.ylabel('Difference from community mean of positive irony')\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Examine distribution of positive, negative, and neutral / unlabelled irony in both subreddits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pos = 0\n", + "neg = 0\n", + "neu = 0\n", + "\n", + "for utt in cleveland.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] < 0:\n", + " neg += 1\n", + " elif utt.meta[\"agree_score\"] > 0:\n", + " pos += 1\n", + " else:\n", + " neu += 1\n", + "\n", + "for utt in ohio.iter_utterances():\n", + " if \" /s \" in utt.text or \"\\n/s\" in utt.text:\n", + " if utt.meta[\"agree_score\"] < 0:\n", + " neg += 1\n", + " elif utt.meta[\"agree_score\"] > 0:\n", + " pos += 1\n", + " else:\n", + " neu += 1\n", + "\n", + "labels = ['Neg', 'Neu', 'Pos']\n", + "plt.bar(labels, [neg, neu, pos])\n", + "plt.xlabel('Irony type')\n", + "plt.ylabel('Count')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cs4300-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}