-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraping.py
More file actions
112 lines (75 loc) · 2.84 KB
/
scraping.py
File metadata and controls
112 lines (75 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import requests
import random
from bs4 import BeautifulSoup
from urllib import parse
from sentimentja import Analyzer
analyzer = Analyzer()
user_agents = ['Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50','Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11']
headers = {'User-Agent': random.choice(user_agents)}
def getSearchResIds(p):
search_url = 'https://news.yahoo.co.jp/search?p={}&ei=UTF-8&fr=news_sw&vaop=a&to=0&st=n&c_n=c_int'.format(p)
r = requests.get(search_url, headers = headers)
content = r.text
soup = BeautifulSoup(content, 'lxml')
results = soup.find(id = 'NSm').find_all(class_ = 'cf')
if len(results) == 0:
return {}
# result = results[9]
a_list = []
for result in results:
a = result.find(class_ = 't').find('a')
title = a.text
href = a['href']
parseResult = parse.urlparse(href)
param_dict = parse.parse_qs(parseResult.query)
a_list.append({ "title": title, "id": param_dict['a'][0] })
return a_list
def getCommentsUrl(id):
url = 'https://headlines.yahoo.co.jp/cm/main?d=' + id
r = requests.get(url, headers = headers)
content = r.text
soup = BeautifulSoup(content, 'lxml')
plugin = soup.find(class_ = 'news-comment-plugin')
if plugin == None:
return False
origin = 'https%3A%2F%2Fheadlines.yahoo.co.jp'
sort = plugin['data-sort']
order = plugin['data-order']
# 页数
page = plugin['data-page']
type = plugin['data-page-type']
keys = plugin['data-keys']
full_page_url = plugin['data-full-page-url']
# 数量
# comment_num = plugin['data-comment-num']
comment_num = 50
bkt = 'paid001'
grp = plugin['data-grp']
opttype = plugin['data-opttype']
url = 'https://news.yahoo.co.jp/comment/plugin/v1/full/?origin={}&sort={}&order={}&page={}&type={}&keys={}&full_page_url={}&comment_num={}&bkt={}&grp=hdl&opttype={}'.format(origin, sort, order, page, type, keys, full_page_url, comment_num, bkt, grp,opttype)
return url
def getComments(url):
r = requests.get(url, headers = headers)
content = r.text
soup = BeautifulSoup(content, 'lxml')
number = soup.find(class_ = 'num').find('dd').get_text()
if number == '0':
return { 'number': '0', 'items': [] }
list = soup.find(id = 'comment-list-item')
lis = list.find_all(class_ = 'commentListItem')
items = []
for item in lis:
comment = item.find(class_ = 'comment').find('span').get_text()
items.append(comment)
return { 'number': number, 'items': items }
def comments(id):
url = getCommentsUrl(id)
res = {}
if url == False:
res["comment_num"] = 0
res["comments"] = []
else:
comments = getComments(url)
res["comment_num"] = comments['number']
res["comments"] = analyzer.analyze(comments['items'])
return res