htmlCrawler.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 17 20:29:41 2020
@author: crystalhansen
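
Downloads a set of Amazon.ca beauty best-seller category pages, parses each
one with BeautifulSoup, and writes the listed items to a timestamped text
file under the beauty/ directory tree (one file per category).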
"""
import bs4 as bs #beautifulSoup
import urllib.request
from datetime import datetime, timedelta
dt = datetime.now()  # + timedelta(hours=1) would shift the timestamp one hour ahead if needed
d = dt.strftime("%m-%d-%Y_%H-%M-%S")
print('htmlCrawler Amazon')

# Amazon best-seller pages: "Our most popular products based on sales. Updated hourly."
# One request per beauty sub-category; each response is parsed with lxml.

# Best Sellers in Beauty
beautyhtml = urllib.request.urlopen('https://www.amazon.ca/Best-Sellers-Beauty/zgbs/beauty/ref=zg_bs_nav_0')
beautySoup = bs.BeautifulSoup(beautyhtml, 'lxml')

# Best Sellers in Skin Care (under Beauty and Personal Care)
beautySkinCarehtml = urllib.request.urlopen("https://www.amazon.ca/Best-Sellers-Beauty-Skin-Care-Products/zgbs/beauty/6344740011/ref=zg_bs_nav_beauty_1_beauty")
skinCareSoup = bs.BeautifulSoup(beautySkinCarehtml, 'lxml')

# Best Sellers in Facial Skin Care
facialSkinCarehtml = urllib.request.urlopen("https://www.amazon.ca/Best-Sellers-Beauty-Facial-Skin-Care-Products/zgbs/beauty/6344751011/ref=zg_bs_nav_beauty_2_6344740011")
facialSkinCareSoup = bs.BeautifulSoup(facialSkinCarehtml, 'lxml')

# Best Sellers in Eye Treatment
eyeTreatmentHtml = urllib.request.urlopen("https://www.amazon.ca/Best-Sellers-Beauty-Eye-Treatment-Products/zgbs/beauty/6344747011/ref=zg_bs_nav_beauty_2_6344740011")
eyeTreatmentHtmlSoup = bs.BeautifulSoup(eyeTreatmentHtml, 'lxml')

# Best Sellers in Body Skin Care
bodyCarehtml = urllib.request.urlopen("https://www.amazon.ca/Best-Sellers-Beauty-Body-Skin-Care-Products/zgbs/beauty/6344741011/ref=zg_bs_nav_beauty_3_6344789011")
bodyCareSoup = bs.BeautifulSoup(bodyCarehtml, 'lxml')

# Best Sellers in Hand, Feet and Nail Care
handsFeetNailshtml = urllib.request.urlopen("https://www.amazon.ca/Best-Sellers-Beauty-Hand-Feet-Nail-Care-Products/zgbs/beauty/6344773011/ref=zg_bs_nav_beauty_2_6344740011")
handsFeetNailsSoup = bs.BeautifulSoup(handsFeetNailshtml, 'lxml')

# Best Sellers in Lip Care
lipCarehtml = urllib.request.urlopen("https://www.amazon.ca/Best-Sellers-Beauty-Lip-Care-Products/zgbs/beauty/6371149011/ref=zg_bs_nav_beauty_2_6344740011")
lipCareSoup = bs.BeautifulSoup(lipCarehtml, 'lxml')

# Best Sellers in Skin Care Sets
SkinCareSetshtml = urllib.request.urlopen("https://www.amazon.ca/Best-Sellers-Beauty-Skin-Care-Sets/zgbs/beauty/6344789011/ref=zg_bs_nav_beauty_2_6344740011")
SkinCareSetsSoup = bs.BeautifulSoup(SkinCareSetshtml, 'lxml')
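
# Note: urllib sends a default Python User-Agent, and Amazon may answer it with a
# robot-check or error page instead of the real listing. If that happens, fetching
# with a browser-style User-Agent header is one possible workaround. The helper
# below is only a sketch and is not used by the calls above; the header string is
# an arbitrary example.
def fetch_soup(url):
    """Fetch a page with a browser-like User-Agent and return a parsed soup."""
    req = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"},
    )
    with urllib.request.urlopen(req) as resp:
        return bs.BeautifulSoup(resp.read(), 'lxml')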
# Quick sanity check: print the <title> tag and its text for the skin care page.
print(skinCareSoup.title)
print(skinCareSoup.title.string)

# Other BeautifulSoup navigation examples, left commented out for reference:
#print(skinCareSoup.title.name)         # tag name ("title")
#print(skinCareSoup.title.parent.name)  # parent tag of <title>
#print(skinCareSoup.p)                  # first <p> element
#print(skinCareSoup.find_all('p'))      # every <p> element
#for paragraph in skinCareSoup.find_all('p'):
#    print(paragraph.text)
## each best-seller item sits in an <li> with class "zg-item-immersion"
#for li in skinCareSoup.find_all('li'):
#    print(li.string)
#print(skinCareSoup.get_text())         # full page text
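
# The open() calls below expect these sub-folders to exist already; creating them
# up front (added here as a convenience) avoids a FileNotFoundError on a fresh
# checkout.
import os
for folder in ("beautyPersonalCare", "skinCare", "facialSkinCare", "eyeTreatment",
               "bodyCare", "handsFeet", "lipCare", "skinCareSets"):
    os.makedirs(os.path.join("beauty", folder), exist_ok=True)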
fileN= "beauty/beautyPersonalCare/beautyAmazon_" + d +".txt"
f=open(fileN,"w")
f.write(beautySoup.title.string +"\n ")
for li in beautySoup.find_all('li', class_='zg-item-immersion'):
#print(li.text.strip()+", ")
f.write(li.text.strip()+"; \n")
f.close()
fileN= "beauty/skinCare/beautySkinCareAmazon_" + d +".txt"
f = open(fileN,"w") #open file with name test.txt
f.write(skinCareSoup.title.string +"\n ")
for li in skinCareSoup.find_all('li', class_='zg-item-immersion'):
#print(li.text.strip()+"; ")
f.write(li.text.strip()+"; \n")
f.close()
fileN= "beauty/facialSkinCare/beautyFacialSkinCareAmazon_" + d +".txt"
f = open(fileN,"w") #open file with name test.txt
f.write(facialSkinCareSoup.title.string)
for lifs in facialSkinCareSoup.find_all('li', class_='zg-item-immersion'):
# print(lifs.text.strip()+", ")
f.write(lifs.text.strip()+"; \n")
f.close()
fileN= "beauty/eyeTreatment/beautyEyeTreatmentCareAmazon_" + d +".txt"
f = open(fileN,"w") #open file with name test.txt
f.write(eyeTreatmentHtmlSoup.title.string)
for lifs in eyeTreatmentHtmlSoup.find_all('li', class_='zg-item-immersion'):
#print(lifs.text.strip()+", ")
f.write(lifs.text.strip()+"; \n")
f.close()
fileN= "beauty/bodyCare/beautyBodyCareAmazon_" + d +".txt"
f = open(fileN,"w") #open file with name test.txt
#f = open("bodyCareAmazon.txt","w") #open file with name test.txt
f.write(bodyCareSoup.title.string )
for lifs in bodyCareSoup.find_all('li', class_='zg-item-immersion'):
#print(lifs.text.strip()+", ")
f.write(lifs.text.strip()+"; \n")
f.close()
fileN= "beauty/handsFeet/beautyHandsFeetCareAmazon_" + d +".txt"
f = open(fileN,"w") #open file with name test.txt
#f = open("HandsFeetCareAmazon.txt","w") #open file with name test.txt
f.write(handsFeetNailsSoup.title.string)
for lifs in handsFeetNailsSoup.find_all('li', class_='zg-item-immersion'):
#print(lifs.text.strip()+", ")
f.write(lifs.text.strip()+"; \n")
f.close()
fileN= "beauty/lipCare/beautyLipCareAmazon_" + d +".txt"
f = open(fileN,"w") #open file with name test.txt
#f = open("LipCareAmazon.txt","w") #open file with name test.txt
f.write(lipCareSoup.title.string)
for lifs in lipCareSoup.find_all('li', class_='zg-item-immersion'):
#print(lifs.text.strip()+", ")
f.write(lifs.text.strip()+"; \n")
f.close()
fileN= "beauty/skinCareSets/beautySkinCareSetsAmazon_" + d +".txt"
f = open(fileN,"w") #open file with name test.txt
#f = open("SkinCareSetsAmazon.txt","w") #open file with name test.txt
f.write(SkinCareSetsSoup.title.string)
for lifs in SkinCareSetsSoup.find_all('li', class_='zg-item-immersion'):
#print(lifs.text.strip()+", ")
f.write(lifs.text.strip()+"; \n")
f.close()
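
# A possible consolidation of the repeated blocks above, left as a sketch (it is
# defined but not called, so the script's behavior is unchanged). Each entry
# pairs an output-path prefix with an already-parsed soup.
def save_best_sellers(pages, timestamp):
    """Write one semicolon-separated text file per category soup."""
    for path_prefix, soup in pages.items():
        with open(path_prefix + timestamp + ".txt", "w") as out:
            if soup.title and soup.title.string:
                out.write(soup.title.string + "\n")
            for item in soup.find_all('li', class_='zg-item-immersion'):
                out.write(item.text.strip() + "; \n")

# Example call (commented out):
# save_best_sellers({
#     "beauty/beautyPersonalCare/beautyAmazon_": beautySoup,
#     "beauty/skinCare/beautySkinCareAmazon_": skinCareSoup,
# }, d)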