-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsentiment_analysis.py
More file actions
103 lines (95 loc) · 4.19 KB
/
sentiment_analysis.py
File metadata and controls
103 lines (95 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Ok, so if we want to figure out how positive/negative certain characters are,
# we want to perform sentiment analysis.
# So let's import nltk's sentiment analyzer package and the sentence tokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import os
import numpy as np
import glob
import matplotlib.pyplot as plt
# Now that we've imported the required stuff, we have to get the lines spoken by
# each character. To do this using memory efficiently, let's calculate our final
# stats for each character separately.
main_characters = [
'Phineas',
'Ferb',
'Candace',
'Isabella',
'Buford',
'Baljeet',
'Linda',
'Doofenshmirtz',
'Perry',
'Major Monogram',
'Carl',
'Stacy',
'Jeremy',
'Lawrence',
'Vanessa',
]
# Define data struct for storing final positivity/negativity scores
pos_neg_scores = {}
# Let's get a list of all the scripts
s1_script_paths = sorted(glob.glob('transcripts/season_1/*.txt'), key=os.path.getmtime)
s2_script_paths = sorted(glob.glob('transcripts/season_2/*.txt'), key=os.path.getmtime)
s3_script_paths = sorted(glob.glob('transcripts/season_3/*.txt'), key=os.path.getmtime)
s4_script_paths = sorted(glob.glob('transcripts/season_4/*.txt'), key=os.path.getmtime)
script_paths = [s1_script_paths, s2_script_paths, s3_script_paths, s4_script_paths]
# For each character, we want to go through each episode script from each season
# and store their lines in a list. Once we've got all their lines, we can get
# their compound scores for each line, and then take the mean of the compound
# scores to get an overall positive/negative rating for the character.
for character in main_characters:
character_lines = []
for season in range(0, 4):
for episode in script_paths[season]:
script = open(episode, 'r')
for line in script:
if line[:1] == '(' and line.strip()[-1:] == ')':
continue
if line[:1] == '\xe2' and line.strip()[-1:] == '\xe2':
continue
if line.strip() == '' or line.strip() == 'End Credits':
continue
if line.strip() == 'Part I' or line.strip() == 'Part II':
continue
character_name = line.split(':')[0]
if character_name == character:
# tokenize line because it might be more than one actual line.
character_lines.extend(tokenize.sent_tokenize(line))
# handle Major Monogram's edge case
if character_name == 'Monogram' and character == 'Major Monogram':
character_lines.extend(tokenize.sent_tokenize(line))
# now we have all the lines for a character, we can analyze
sid = SentimentIntensityAnalyzer() # haha i'm sid and i'm calling this sid
pos_neg_scores[character] = []
for sentence in character_lines:
scores = sid.polarity_scores(sentence)
pos_neg_scores[character].append(scores['compound'])
pos_neg_scores[character] = np.array(pos_neg_scores[character])
# Now that each character has a numpy array associated with them for their subjectivity
# scores, I can plot their distributions.
# After plotting distributions, I see that a large chunk of lines are neutral.
# How large is that chunk, you ask? I'll answer.
for fig in range(0, len(main_characters)):
plt.figure(fig)
character = main_characters[fig]
plt.hist(pos_neg_scores[character], 10, normed=1, facecolor='g', alpha=0.75)
pos = []
neg = []
neu = []
for score in np.nditer(pos_neg_scores[character]):
if score <= -0.5:
neg.append(score)
elif score >= 0.5:
pos.append(score)
else:
neu.append(score)
total_len = pos_neg_scores[character].size
pos_percent = round(len(pos)/total_len * 100, 3)
neg_percent = round(len(neg)/total_len * 100, 3)
neu_percent = round(len(neu)/total_len * 100, 3)
plt.title('Histogram of {}\'s Positivity Per Line'.format(character))
plt.xlabel('Positivity Scores\nNegative: {}%\nNeutral: {}%\nPositive: {}%'.format(neg_percent, neu_percent, pos_percent))
plt.ylabel('Number of Lines')
plt.show()