-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy path2-amazon.py
112 lines (95 loc) · 5.26 KB
/
2-amazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from pattern.web import URL, DOM, plaintext
from pattern.db import Datasheet
# STEP TWO: ACQUIRING TEST DATA
# =============================
# For testing, we need real-world data (Twitter messages, book reviews, blog posts, ...)
# Many online reviews use a "star rating": * ** *** **** or *****.
# This is essentially a numeric value from 1-5 (very bad => very good).
# This number is ideal for statistical analysis.
# It basically represents the opinion hidden somewhere in the text.
# If we don't have something like a star rating, we need to tag the data by hand.
# Otherwise we can't analyze how well the algorithm performs.
# There is a big online collection of human opinions:
# http://www.amazon.fr/
# We're not allowed to mine Amazon,
# but since we will only use their reviews for testing we could argue to do it anyway.
# I used the Chrome browser to visit http://www.amazon.fr,
# clicked "Livres" > "Litterature" > "Litterature francaise".
# I end up at a page which displays an overview of popular books.
# Each link in the overview leads to a book page with customer reviews + star rating.
# When I navigate the overview (page 2 and 3), I end up at the following URLs:
# http://www.amazon.fr/s/ref=sr_pg_2?rh=n%3A301061%2Cn%3A%21301130%2Cn%3A301132%2Cn%3A302038&page=2&ie=UTF8&qid=1354668977
# http://www.amazon.fr/s/ref=sr_pg_3?rh=n%3A301061%2Cn%3A%21301130%2Cn%3A301132%2Cn%3A302038&page=3&ie=UTF8&qid=1354669111
# It seems we can drop the trailing "ie" and "qid" parameters.
# Also, the "current page" is defined by the number in the url, which we can replace with a variable:
def books(i):
    """Return the URL of overview page *i* of the Amazon.fr book listing.

    The page number appears twice in the address: once in the
    "ref=sr_pg_<i>" path segment and once in the "page=<i>" query
    parameter. The "ie" and "qid" parameters that the browser shows
    are left out.

    i -- the page number (anything str() can render, typically an int >= 1).
    """
    page = str(i)
    return "http://www.amazon.fr/s/ref=sr_pg_" + page + \
           "?rh=n%3A301061%2Cn%3A%21301130%2Cn%3A301132%2Cn%3A302038&page=" + page
# Try it out by pasting the URL in a browser:
#print books(1)
#print books(2)
# We can use Chrome's Developer Tools to inspect the HTML of the overview page.
# It turns out each link to each book is contained in a <div class="prod"> element.
# In Pattern, the DOM (Document Object Model) is a tree of nested HTML elements,
# along with useful methods to traverse and search the tree.
# http://www.clips.ua.ac.be/pages/pattern-web#DOM
# It is easy to fetch each <div class="prod">:
# Crawl the overview pages, visit the reviews page of every listed book,
# and collect (review text, star rating) rows in a Datasheet.
corpus = Datasheet()
for page in range(1, 46): # How many pages?
    # Download the overview page; cache the HTML source locally.
    html = URL(books(page)).download(cached=True)
    for product in DOM(html).by_class("prod"):
        #print(product.source)
        # The link to each book page looks something like:
        # http://www.amazon.fr/dieux-voyagent-toujours-incognito/dp/2266219154/
        # After some searching with Chrome,
        # I found that there is a page with 10 reviews about this book:
        # http://www.amazon.fr/product-reviews/2266219154/
        # So we want to parse the book id from the first link and mine its reviews page:
        try:
            href = product.by_tag("a")[0].attributes["href"]
            #print(href)
            book_id = href.split("/")[-2]
        except (IndexError, KeyError):
            # No <a> element, no href attribute, or an href with too few "/" parts:
            # skip this product instead of aborting the whole (slow, throttled) crawl.
            continue
        if not book_id:
            # An empty id would yield a bogus ".../product-reviews//" URL.
            continue
        reviews_url = "http://www.amazon.fr/product-reviews/" + book_id + "/"
        print(reviews_url)
        # We can use Chrome's Developer Tools to inspect the HTML of the review page.
        # It turns out the reviews are contained in a <table id="productReviews"> element.
        # This table has one row and two columns.
        # Each <div> in the first column is a review.
        # If the table is absent, it means there are no reviews for this book.
        reviews_html = URL(reviews_url).download(cached=True, throttle=20) # throttle = delay between crawls
        reviews_table = DOM(reviews_html).by_id("productReviews")
        if reviews_table is not None:
            for review in reviews_table.by_tag("div"):
                # We use a try-except statement to brute-force it:
                # The <div>'s in the table do not have a class to search for,
                # and there may be other <div>'s in-between, which end up in the except-block.
                try:
                    # The star rating is <span class="swSprite s_star_5_0 " title="5.0 etoiles sur 5">.
                    stars = review.by_class("swSprite")[0].attributes["title"]
                    score = float(stars.split(" ")[0])
                    # The review is contained as plain text in the <div>:
                    # join the source of the direct text children only.
                    text = " ".join(
                        child.source for child in review.children if child.type == "text")
                    text = plaintext(text.strip()) # Remove HTML entities, tags, etc.
                    if text:
                        corpus.append((text, score))
                        print(score)
                        print(text)
                        print("")
                except Exception as e:
                    # Deliberately best-effort: any <div> that does not parse as a
                    # review (no star rating, no title, non-numeric score, ...)
                    # is skipped. Uncomment the line below to see what was skipped.
                    #print(e)
                    pass
        # Now and then (here: after each book), save the corpus of
        # (review, score) items as a .csv file.
        corpus.save("books-fr.csv")
# Can you think of other test data to mine for?
# Can you see why it would be useful to have different test sets?
# - Instead of book reviews + star rating, how about tweets + #win or #fail hashtag?
# - How about hotel reviews + star rating from http://fr.hotels.com?
# - ...