forked from hamanhbui/neurips2021_analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl_data.py
More file actions
38 lines (31 loc) · 1.28 KB
/
crawl_data.py
File metadata and controls
38 lines (31 loc) · 1.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import requests
from bs4 import BeautifulSoup
import pickle
# Scrape the NeurIPS 2021 accepted-papers list: collect every poster's title
# and its author list ("Name (Institution)"), then pickle both lists to disk.

# Per-request timeout (seconds) so one stalled server response cannot hang
# the whole crawl indefinitely.
TIMEOUT = 30

# One Session reuses the TCP connection across the many sequential requests.
session = requests.Session()

page = session.get("https://nips.cc/Conferences/2021/AcceptedPapersInitial", timeout=TIMEOUT)
soup = BeautifulSoup(page.text, "html.parser")

# Each poster card's DOM id looks like "maincard_<event-id>"; keep the id part.
url_id = []
for link in soup.find_all("div", {"class": "maincard narrower poster"}):
    card_id = link.get("id")
    if card_id and "_" in card_id:  # guard against cards missing the expected id
        url_id.append(card_id.split("_")[1])

pp_names = []    # paper titles
pp_authors = []  # parallel list: authors of pp_names[i]
for event_id in url_id:  # renamed from `id`, which shadowed the builtin
    pp_soup = BeautifulSoup(
        session.get(
            "https://nips.cc/Conferences/2021/Schedule?showEvent=" + event_id,
            timeout=TIMEOUT,
        ).text,
        "html.parser",
    )
    pp_name = pp_soup.find("div", {"class": "maincardBody"})
    if pp_name is None:  # layout changed or event page is empty; skip, keep lists aligned
        continue

    authors = []
    for author in pp_soup.find_all("button", {"class": "btn btn-default"}):
        at_id = author.get("onclick")
        if not at_id:
            continue
        # onclick looks like "showSpeaker('12345-67')"; take the text between ('  ').
        at_id = at_id[at_id.find("('") + 2 : at_id.find("')")]
        at_soup = BeautifulSoup(
            session.get(
                "https://neurips.cc/Conferences/2021/Schedule?showSpeaker=" + at_id,
                timeout=TIMEOUT,
            ).text,
            "html.parser",
        )
        at_name = at_soup.find("h3")  # speaker name
        ins_name = at_soup.find("h4")  # speaker institution
        if at_name is None or ins_name is None:  # malformed speaker page; skip author
            continue
        authors.append(at_name.get_text() + " (" + ins_name.get_text() + ")")

    pp_names.append(pp_name.get_text())
    pp_authors.append(authors)

# Persist results. NOTE: despite the .txt suffix these are binary pickle files
# (names kept for compatibility with downstream readers of this repo).
with open("pp_names.txt", "wb") as fp:
    pickle.dump(pp_names, fp)
with open("pp_authors.txt", "wb") as fp:
    pickle.dump(pp_authors, fp)