-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy path3-aligned.py
68 lines (57 loc) · 2.28 KB
/
3-aligned.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from pattern.db import Datasheet
from random import shuffle
# STEP THREE: SETTING UP A TEST FRAMEWORK
# =======================================
# We now have a set of test data (books-fr.csv),
# and a lexicon of adjectives (adj-fr.csv).
# We can use the lexicon to build a sentiment prediction algorithm,
# and evaluate how well it performs on the test data.
# Load the review corpus from CSV. The loops further down unpack every row
# of `data` as a (review, score) pair.
data = Datasheet.load("books-fr.csv")
# Report the corpus size as a quick sanity check (Python 2 print statement).
print "number of reviews:", len(data)
# We have 5,444 reviews + score.
# More data = better training material + more reliable testing.
# To set up a test that is statistically solid, we need to "align" the data.
# It is a good idea to remove neutral reviews (= star rating 3),
# and have an equal amount of negative (= star rating 1-2) and positive (= 4-5) reviews.
# This is a form of binary classification:
# Either a review in the test data is positive or it is not.
# Let's look at the distribution of the data:
distribution = {}
for review, score in data:
score = float(score)
if score not in distribution:
distribution[score] = 0
distribution[score] += 1
print "distribution of reviews by star rating:", distribution
# As can be expected, the data is skewed: 519 negative vs. 4,925 positive reviews.
# People tend to give positive reviews more easily.
# If we don't align the data, our test will be biased.
# An algorithm that is good at detecting positive reviews will do very well,
# while in reality it might be very bad at detecting negative reviews
# (for example, it could be predicting *all* test reviews as positive).
aligned = {
-1: [],
+1: []
}
for review, score in data:
score = float(score)
if score == 3: # Discard neutral reviews.
continue
if score < 3:
aligned[-1].append(review)
if score > 3:
aligned[+1].append(review)
m = min(len(aligned[-1]), len(aligned[+1]))
m = min(m, 500)
aligned[-1] = aligned[-1][:m]
aligned[+1] = aligned[+1][:m]
print "aligned test corpus:"
print len(aligned[-1])
print len(aligned[+1])
# The aligned list contains (review, positive)-tuples,
# where positive is either True or False,
# with an equal amount of True and False reviews:
# Flatten the two buckets into one list of (review, positive) tuples:
# first every negative review tagged False, then every positive one tagged
# True. The right-hand side is fully evaluated before `aligned` is rebound,
# so the comprehension still reads the dict of buckets.
aligned = [
    (review, positive)
    for label, positive in ((-1, False), (+1, True))
    for review in aligned[label]
]
# Shuffle in place so the two classes are interleaved in the saved file.
shuffle(aligned)
Datasheet(aligned).save("books-fr.test.csv")