-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsentiment_class.py
207 lines (171 loc) · 7.82 KB
/
sentiment_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""
-------------------------------------------------------------------
-- Project: Analysis of Coronavirus related Tweets using TwitterAPI
-- Author: Georgios Spyrou
-- Last Updated: 19/09/2020
-------------------------------------------------------------------
"""
import matplotlib.pyplot as plt
from collections import Counter
from pandas import DataFrame
from wordcloud import WordCloud
from seaborn import barplot, countplot
from nltk.collocations import BigramCollocationFinder
from nltk import word_tokenize
def month_as_string(month_as_int: int) -> str:
    """
    Translate a month number into its English name (e.g. 1 -> 'January').

    Args:
    ------
        month_as_int: Month number, from 1 (January) to 12 (December)

    Returns:
    --------
        Full English name of the month

    Raises:
    -------
        KeyError: if month_as_int is not a key of the lookup table (1-12)
    """
    month_names = {
        1: 'January', 2: 'February', 3: 'March', 4: 'April',
        5: 'May', 6: 'June', 7: 'July', 8: 'August',
        9: 'September', 10: 'October', 11: 'November',
        12: 'December',
    }
    return month_names[month_as_int]
class TwitterSentiment:
    """
    Exploratory text analysis and lexicon-based sentiment classification of
    tweets stored in a pandas dataframe.

    The tweet text is read from the column named by `tweet_column`. The
    'Year' and 'Month' columns are used by subset_dataframe, and 'Month' is
    used by plot_sentiment when a month is requested.
    """

    # Lookup sets of the opinion lexicon, built on first use and shared by
    # every instance so the corpus is only loaded once per process.
    _positive_words = None
    _negative_words = None

    def __init__(self, input_df, tweet_column):
        self.df = input_df
        # Year / month of the active subset; None until subset_dataframe
        # succeeds. Only used to label plots.
        self.year = None
        self.month = None
        self.tweet_column = tweet_column

    def _tweet_texts(self, column=None) -> list:
        """
        Return the string entries of a text column, skipping missing values
        (None / NaN) and anything else that is not a string.
        """
        column = self.tweet_column if column is None else column
        return [text for text in self.df[column] if isinstance(text, str)]

    def subset_dataframe(self, year: int, month: int):
        """
        Restrict the working dataframe to the tweets of one year and month.

        The year and the month are validated independently against the values
        present in the 'Year' and 'Month' columns. If either is missing a
        message is printed and the object is left untouched. Otherwise
        self.df is replaced by the matching rows and self.year / self.month
        are updated. The replacement is permanent for this object: later
        calls filter the already reduced dataframe.

        Args:
        ------
            year: Year to keep
            month: Month (as an integer) to keep
        """
        if not (self.df['Year'] == year).any():
            print('This year does not exist')
        elif not (self.df['Month'] == month).any():
            print('This month does not exist')
        else:
            selected = (self.df['Year'] == year) & (self.df['Month'] == month)
            # Copy so that columns added later (e.g. 'Sentiment') are written
            # to an independent frame rather than to a slice of the input.
            self.df = self.df[selected].copy()
            # Record the filter only once it has actually been applied, so
            # plot titles never advertise a subset that was rejected.
            self.year = year
            self.month = month

    def most_common_words(self, tweet_column: str = None, n_most_common=20):
        """
        Find the words that appear in the largest number of tweets.

        Each tweet is split on whitespace and every distinct word is counted
        once per tweet, so 'count' is the number of tweets containing the
        word, not the total number of occurrences.

        Args:
        ------
            tweet_column: Name of the text column to analyse. Defaults to the
                          column given at construction time
            n_most_common: Number of most common words to return

        Returns:
        --------
            Pandas dataframe with the columns 'words' and 'count', ordered
            from the most to the least frequent word. The result is also
            stored in self.common_words_df
        """
        word_counter = Counter()
        for text in self._tweet_texts(tweet_column):
            # set(): a word repeated inside one tweet counts only once
            word_counter.update(set(text.split()))

        self.common_words_df = DataFrame(
            word_counter.most_common(n_most_common),
            columns=['words', 'count'])

        return self.common_words_df

    def plot_most_common_words(self, n_most_common=20, figsize=(10, 10)) -> None:
        """
        Draw a horizontal bar chart of the most common words.

        Args:
        ------
            n_most_common: Number of words to display
            figsize: Size of the matplotlib figure
        """
        fig, ax = plt.subplots(figsize=figsize)

        common_words_df = self.most_common_words(
            tweet_column=self.tweet_column, n_most_common=n_most_common)

        barplot(x='count', y='words', data=common_words_df, ax=ax)
        ax.grid(True, alpha=0.3, linestyle='-', color='black')

        if self.year is not None and self.month is not None:
            title = (f'Common Words Found in Tweets - '
                     f'{month_as_string(self.month)} {self.year}')
        else:
            title = 'Common Words Found in Tweets - Overall'
        ax.set_title(title, fontweight='bold')

        plt.show()

    def plot_wordcloud(self, figsize=(10, 10)) -> None:
        """
        Generate a WordCloud plot based on the number of occurrences of words
        in the tweets. Missing tweets are ignored.
        """
        plt.figure(figsize=figsize)

        wordcloud = WordCloud().generate(' '.join(self._tweet_texts()))

        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()

    def compute_bigrams(self) -> dict:
        """
        Count how often two tokens appear next to each other.

        All tweets are joined into a single string before tokenising, so a
        pair can be made of the last token of one tweet and the first token
        of the next one. Missing tweets are ignored.

        Returns:
        --------
            Dictionary mapping (first_token, second_token) to its count
        """
        tokens = word_tokenize(' '.join(self._tweet_texts()))
        finder = BigramCollocationFinder.from_words(tokens)

        # Drop pairs containing a single-character token (e.g. '@', '#') or
        # the possessive token "'s".
        return {
            pair: count
            for pair, count in finder.ngram_fd.items()
            if len(pair[0]) > 1 and len(pair[1]) > 1 and "'s" not in pair
        }

    def plot_bigrams(self, top_n: int, figsize=(10, 8)) -> None:
        """
        Draw a bar chart of the most frequent pairs of adjacent words.

        Args:
        ------
            top_n: Number of pairs to display
            figsize: Size of the matplotlib figure

        Raises:
        -------
            ValueError: if there is no pair to plot
        """
        bigrams_dict = self.compute_bigrams()

        top_bigrams = sorted(bigrams_dict.items(), key=lambda item: item[1],
                             reverse=True)[:top_n]
        if not top_bigrams:
            raise ValueError('No bigrams available to plot')

        pairs, counts = zip(*top_bigrams)
        labels = ['-'.join(pair) for pair in pairs]

        plt.figure(figsize=figsize)
        # x / y are passed by keyword: recent seaborn versions no longer
        # accept them positionally.
        g = barplot(x=labels, y=list(counts), palette='muted')
        g.set_xticklabels(g.get_xticklabels(), rotation=80)
        plt.title(f'Top-{top_n} pairs of words that appear next to each other',
                  fontweight='bold')
        plt.ylabel('Count')
        plt.grid(True, alpha=0.2, color='black')
        plt.show()

    @classmethod
    def _opinion_word_sets(cls):
        """
        Return the (positive, negative) word sets of the opinion lexicon,
        loading them from the NLTK corpus on first use only.
        """
        if cls._positive_words is None or cls._negative_words is None:
            from nltk.corpus import opinion_lexicon

            cls._positive_words = frozenset(opinion_lexicon.positive())
            cls._negative_words = frozenset(opinion_lexicon.negative())
        return cls._positive_words, cls._negative_words

    def liu_hu_opinion_lexicon(self, sentence: str) -> str:
        """
        Modified version of the Liu Hu opinion lexicon algorithm for sentiment
        analysis on sentences.

        Reference: https://www.nltk.org/_modules/nltk/sentiment/util.html#demo_liu_hu_lexicon

        The function has been modified to return the values instead of
        printing. The sentence is tokenised and lower-cased, and the tokens
        found in the positive and in the negative word lists are counted. A
        token present in both lists is counted as positive.

        Args:
        ------
            sentence: Text to classify; it must be a string

        Returns:
        --------
            Sentiment of a text, classified as 'Positive','Negative' or 'Neutral'
        """
        from nltk.tokenize import treebank

        # Set lookups on cached word lists: asking the corpus reader again
        # for every token makes classifying a whole column very slow.
        positive_words, negative_words = self._opinion_word_sets()

        tokenizer = treebank.TreebankWordTokenizer()
        pos_words, neg_words = 0, 0

        for word in tokenizer.tokenize(sentence):
            word = word.lower()
            if word in positive_words:
                pos_words += 1
            elif word in negative_words:
                neg_words += 1

        if pos_words > neg_words:
            return 'Positive'
        if pos_words < neg_words:
            return 'Negative'
        return 'Neutral'

    def calculate_sentiment(self):
        """
        Calculate the sentiment of every tweet with the liu_hu_opinion_lexicon
        method and store the result in a 'Sentiment' column of self.df
        """
        self.df['Sentiment'] = self.df[self.tweet_column].apply(
            self.liu_hu_opinion_lexicon)

    def plot_sentiment(self, sentiment_month=None, year=None, figsize=(10, 8)) -> None:
        """
        Draw the number of tweets per sentiment class. The 'Sentiment' column
        must exist (see calculate_sentiment).

        Args:
        ------
            sentiment_month: If not None, only the tweets of this month
                             (integer) are plotted
            year: Year shown in the title next to the month. It is used for
                  the title only, the data is not filtered by year
            figsize: Size of the matplotlib figure
        """
        plt.figure(figsize=figsize)

        sentiment_df = self.df
        title_str = 'Sentiment Classification of Tweets'
        if sentiment_month is not None:
            sentiment_df = sentiment_df[sentiment_df['Month'] == sentiment_month]
            title_str += f' - {month_as_string(sentiment_month)}'
            # Only mention the year when one was given, instead of rendering
            # the literal text 'None' in the title.
            if year is not None:
                title_str += f' {year}'

        clr_palette = {'Positive': '#37B41E', 'Neutral': '#1B9EC8',
                       'Negative': '#C92528'}

        g = countplot(x='Sentiment', data=sentiment_df, palette=clr_palette,
                      order=['Negative', 'Neutral', 'Positive'])
        g.set_xticklabels(g.get_xticklabels(), rotation=0)
        plt.title(title_str, fontweight='bold')
        plt.ylabel('Count', labelpad=8)
        plt.xlabel('Sentiment', labelpad=8)
        plt.show()