-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcli.py
More file actions
114 lines (100 loc) · 3.6 KB
/
cli.py
File metadata and controls
114 lines (100 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
CLI function to run the crawlers, compare curricula and manage the database.
Try: 'python cli.py --help' for more information.
"""
import asyncio
from os import environ
from logging import basicConfig, INFO, info, exception
from src.constants import COMPANY_INPUT, ROOT_DIR, RESOURCES_DIR
from sys import argv, path
from src.helper.commands import sanity_check_facade, help_facade_, overwrite_facade
from src.crawler.company import Company
from os import getcwd, system
from dotenv import load_dotenv
from src.helper.helper import get_career_links, initialize_table, read_file
from caqui.easy.server import Server
from src.crawler.company import CompanyInstance
from os import pathsep

# Module-level setup: load configuration, cap crawler concurrency, expose the
# bundled resources on PATH / sys.path, configure logging and create the server.
load_dotenv()  # take environment variables from .env.

MAX_CONCURRENCY = 5  # default number of webdriver instances running
# The MAX_CONCURRENCY environment variable, when set, overrides the default.
SEMAPHORE = asyncio.Semaphore(int(environ.get("MAX_CONCURRENCY", MAX_CONCURRENCY)))

# Prepend RESOURCES_DIR to this process's PATH. The previous
# ``system('export PATH=...')`` call ran in a short-lived subshell, so the
# export never reached the current process or the children it spawns.
environ["PATH"] = pathsep.join(
    entry for entry in (str(RESOURCES_DIR), environ.get("PATH", "")) if entry
)
path.append(RESOURCES_DIR)
path.append(ROOT_DIR)

# Both the DEBUG=on branch and the default branch configured logging with the
# exact same arguments, so a single call preserves the behavior.
# NOTE(review): if DEBUG=on was meant to enable DEBUG-level output, change the
# level here; confirm the intended behavior first.
basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Shared server instance; main() starts it for the crawling commands and the
# script entry point disposes it on exit.
SERVER = Server()
async def get_all_positions(company=None):
    """Run ``overwrite_facade`` for one company under the shared semaphore.

    Args:
        company: Value forwarded unchanged to ``overwrite_facade``
            (defaults to ``None``).

    Returns:
        Whatever ``overwrite_facade(company)`` returns.

    Raises:
        Any exception raised by ``overwrite_facade`` propagates unchanged.
    """
    # Gets data from real companies. Not covered by automated tests to avoid
    # overloading the real sites.
    async with SEMAPHORE:
        return await overwrite_facade(company)
async def get_all_links(company):
    """Run ``get_career_links`` for one company under the shared semaphore.

    Args:
        company: Value forwarded unchanged to ``get_career_links``.

    Returns:
        Whatever ``get_career_links(company)`` returns.

    Raises:
        Any exception raised by ``get_career_links`` propagates unchanged.
    """
    # Gets data from real companies. Not covered by automated tests to avoid
    # overloading the real sites.
    async with SEMAPHORE:
        return await get_career_links(company)
# Reference:
# https://stackoverflow.com/questions/48483348/how-to-limit-concurrency-with-python-asyncio
async def main(*args):
    """Execute the first CLI command found in the given argument collection(s).

    Each positional argument is a collection of command-line tokens (the
    script entry point passes ``sys.argv``). Flags are checked in the order
    listed below and the function returns as soon as one matches.

    Supported flags:
        -h, --help      Log and return the text produced by ``help_facade_``.
        --sanity-check  Start the server and run ``sanity_check_facade``
                        against the local ``sanity_check.html`` page.
        --init          Call ``initialize_table``.
        --overwrite     Start the server and run ``get_all_positions`` for
                        every company returned by ``Company().get_all()``.
        --getlinks      Start the server and run ``get_all_links`` for every
                        entry read from ``COMPANY_INPUT``.
        --clean-db      Call ``initialize_table``.

    Returns:
        The help text for ``--help``, the result of ``sanity_check_facade``
        for ``--sanity-check``, otherwise ``None``. When no flag matches, an
        error is logged and ``None`` is returned.
    """
    # logging.exception() is only meant to be called from an exception
    # handler; the invalid-command path is not handling an exception, so it
    # is logged with error() instead (same ERROR level, no bogus traceback).
    from logging import error

    for arguments in args:
        if "-h" in arguments or "--help" in arguments:
            output = help_facade_()
            info(output)
            return output

        if "--sanity-check" in arguments:
            SERVER.start()
            company_fake = {
                "locator": "//a",
                "url": "file:///" + getcwd() + "/src/resources/sanity_check.html#",
                "active": "Y",
            }
            return await sanity_check_facade(CompanyInstance(company_fake))

        if "--init" in arguments:
            initialize_table()
            return

        if "--overwrite" in arguments:
            SERVER.start()
            companies = Company().get_all()
            # gather() schedules every coroutine; concurrency is capped by the
            # semaphore inside get_all_positions.
            await asyncio.gather(
                *(get_all_positions(company=company) for company in companies)
            )
            return

        if "--getlinks" in arguments:
            SERVER.start()
            companies_url = read_file(COMPANY_INPUT, has_header=False)
            await asyncio.gather(
                *(get_all_links(company=company) for company in companies_url)
            )
            return

        if "--clean-db" in arguments:
            initialize_table()
            return

    error("Invalid command. Try cli.py --help ")
if __name__ == "__main__":
    # Script entry point: run the CLI on a dedicated event loop. Exceptions
    # raised by main() propagate to the interpreter unchanged.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(main(argv))
    finally:
        # Clean-up steps are nested so that a failure in one of them cannot
        # prevent the following ones (in particular loop.close()) from running.
        try:
            SERVER.dispose()
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                loop.close()