"""
Data mining project - "Analysis of Stack Exchange Websites"- main file
1. create databases and tables using ORM with MySQL
2. scrap data from multiple website (with or without multi-proccessing
3. enrich the database with data from an API request
3. insert data into dedicated tables in the database
Authors: Nir Barazida and Inbar Shirizly
"""
from src import logger, config, general, database
import argparse
from API.website_api import WebsiteAPI
from scraper import UserAnalysis
from scraper import UserScraper
import concurrent.futures
from tqdm import tqdm
import random
from itertools import repeat


@general.timer
def scrap_users(website_name, num_users_to_scrap):
"""
receives website name and scrap individual users data (via the classes generators in the related files)
the information scrapped is inserted to the database.
the function generates a random user for a sanity check , logs information that can be checked manually
when the user index reaches the last user needed (per website) finish code.
on the Multi Process mode, this function runs concurrently on different websites
:param website_name: domain name of the website that is been scrapped (str)
:param num_users_to_scrap: number of users to scrap in the following session
:return: None
"""
    first_instance_to_scrap, index_first_page, index_first_instance_in_first_page = general.arrange_first_user_to_scrap(
        website_name)

    # commit website data enrichment from the API
    database.commit_website_to_DB(WebsiteAPI(website_name).website_info)

    logger.info(config.WEBSITE_SCRAPP_INFO.format(website_name, first_instance_to_scrap,
                                                  first_instance_to_scrap + num_users_to_scrap - 1))
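    # pick one random user index in this session whose details will be logged for a
    # manual sanity check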
    random_user_to_check = random.randint(0, num_users_to_scrap - 1)

    # create user link generator
    user_page = UserAnalysis(website_name, index_first_page, index_first_instance_in_first_page)
    user_links_generator = user_page.generate_users_links()

    for num_user, link in enumerate(tqdm(user_links_generator, desc=f"{website_name}",
                                         total=num_users_to_scrap, position=1, leave=False)):
        # create a new user
        user = UserScraper(link, website_name, first_instance_to_scrap)

        # insert the user into the database
        database.insert_user_to_DB(user)

        # log a sanity check for the randomly chosen user
        if num_user == random_user_to_check:
            logger.info(config.SANITY_CHECK_STRING.format(link, website_name,
                                                          user._rank // config.NUM_INSTANCES_IN_PAGE,
                                                          user._reputation_now))

        # stop once the requested number of users has been scraped
        if num_user == num_users_to_scrap - 1:
            break


@general.timer
def main():
    # receive arguments from the command line for the scraping session
    parser = argparse.ArgumentParser(description='Scraping users from Stack Exchange websites')
    parser.add_argument('--num_users_to_scrap', help="Number of users to scrap", type=int, default=10)
    parser.add_argument('--websites', help="Which Stack Exchange websites to scrap from", nargs='+',
                        default=['stackoverflow', 'askubuntu', 'math.stackexchange', 'superuser'],
                        choices={'stackoverflow', 'askubuntu', 'math.stackexchange', 'superuser'})
    parser.add_argument("--multi_process",
                        help="Scrap the different websites with multiprocessing instead of a plain for loop "
                             "(default: False)",
                        type=general.bool_converter, default=False)
    args = parser.parse_args()

    logger.info(config.OPENING_STRING.format(config.DB_NAME, args.num_users_to_scrap,
                                             config.SLEEP_FACTOR, args.multi_process))

    # Multi Process mode
    if args.multi_process:
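        # the websites are scraped concurrently in separate worker processes; repeat()
        # supplies the same num_users_to_scrap value to every scrap_users call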
        with concurrent.futures.ProcessPoolExecutor() as executor:
            executor.map(scrap_users, args.websites, repeat(args.num_users_to_scrap))

    # for loop mode
    else:
        for website_name in args.websites:
            scrap_users(website_name, args.num_users_to_scrap)


if __name__ == '__main__':
    main()