"""
This script crawls candidate URLs for municipalities websites and
checks if they are active and likely to be the city hall or
city council portals.
Usage:
python auto_verify_links.py
For instructions use:
python auto_verify_links.py --help
Este script navega nas URLs candidatas a sites dos municípios e
verifica se elas estão ativas e são prováveis portais das prefeituras
e câmaras municipais.
"""
import argparse
from datetime import datetime
from functools import partial
import logging
import multiprocessing
import os
from typing import Sequence, List
import pandas as pd
from tqdm import tqdm
from validation.verify_links import (healthy_link, get_title_and_type,
get_candidate_links, get_output_to_be_merged, store_csv)

INPUT_FOLDER = '../../data/unverified'
INPUT_FILE = 'municipality-website-candidate-links.csv'
MAX_SIMULTANEOUS = 10
MAX_QUANTITY = 0
OUTPUT_FOLDER = '../../data/valid'


def verify_city_links(candidates: pd.DataFrame, code: int) -> List[dict]:
"""Verify links for a city with a given code.
Args:
candidates (pd.DataFrame): The dataframe slice with the link
candidates to verify.
code (int): The IBGE municipality code associated with the link.
Returns:
        List[dict]: A list of dictionaries containing information about
            the detected links.
"""
verified_links = []
city_links = candidates[candidates.code == code]
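    # check every unique candidate URL for this city; keep only the ones
    # that respond and can be classified by get_title_and_type()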
for link in city_links.link.unique():
working_link = healthy_link(link)
if working_link:
_, link_type = get_title_and_type(
working_link,
candidates[candidates.link==link]
)
if link_type is not None:
verified_link = {
'code': code,
'link': working_link.url, # update if redirected
'link_type': link_type,
'name': city_links.name.iloc[0],
'uf': city_links.uf.iloc[0],
'last_checked': datetime.utcnow()
}
verified_links.append(verified_link)
return verified_links
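

# Each record returned by verify_city_links() looks roughly like the example
# below (values are purely illustrative; link_type comes from
# get_title_and_type() and is mapped to a branch name further down):
#
#     {'code': 3550308, 'link': 'https://www.capital.sp.gov.br/',
#      'link_type': 'prefeitura', 'name': 'São Paulo', 'uf': 'SP',
#      'last_checked': datetime.utcnow()}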


def parse_cli() -> dict:
    """Parses the command line arguments.
Returns:
dict: A dict containing the values for input_folder, input_file,
data_package_path, max_quantity, max_simultaneous
"""
parser = argparse.ArgumentParser(
        description='''Crawls candidate URLs for municipality websites and checks
if they are active and likely to be the city hall or city council portals.'''
)
parser.add_argument('input',
help='input file in CSV format',
default='',
nargs='?',
)
parser.add_argument('output',
help=('output folder for the CSV '
'(must have a datapackage.json with a schema)'),
default='',
nargs='?',
)
parser.add_argument('-q', '--quantity',
metavar='int', type=int,
help='maximum quantity of cities to process',
default=0,
)
parser.add_argument('-p', '--processes',
metavar='int', type=int,
help='number of processes (parallel downloads) to use',
default=8,
)
params = {}
args = parser.parse_args()
if args.input:
params['input_folder'] = os.path.dirname(args.input)
params['input_file'] = os.path.basename(args.input)
else: # use default values
params['input_folder'] = INPUT_FOLDER
params['input_file'] = INPUT_FILE
if args.output:
if not os.path.exists(args.output):
raise FileNotFoundError(f'Folder not found: {args.output}')
output_folder = args.output
else: # use default value
output_folder = OUTPUT_FOLDER
params['data_package_path'] = os.path.join(output_folder, 'datapackage.json')
if not os.path.exists(params['data_package_path']):
raise FileNotFoundError(
f'datapackage.json not found in folder: {args.output}')
if args.quantity:
params['max_quantity'] = args.quantity
else: # use default value
params['max_quantity'] = MAX_QUANTITY
if args.processes:
params['max_simultaneous'] = args.processes
else: # use default value
params['max_simultaneous'] = MAX_SIMULTANEOUS
return params
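
# parse_cli() returns a plain dict; with no arguments it evaluates to the
# module defaults, e.g.:
#
#     {'input_folder': '../../data/unverified',
#      'input_file': 'municipality-website-candidate-links.csv',
#      'data_package_path': '../../data/valid/datapackage.json',
#      'max_quantity': 0,
#      'max_simultaneous': 8}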


def auto_verify(input_folder: str, input_file: str, data_package_path: str,
                max_quantity: int, max_simultaneous: int) -> pd.DataFrame:
    """Automatically verifies links and tries to infer the link type of
    each one.
A Python function that does the same job as the script that is run
from the command line.
Args:
input_folder (str): The folder containing the input table.
input_file (str): Name of the file containing the input table in
csv format.
data_package_path (str): Path to the datapackage.json file.
max_quantity (int): Maximum quantity of links to check.
max_simultaneous (int): Maximum quantity of simultaneous (in
parallel) checks.
Returns:
pd.DataFrame: Pandas dataframe containing the verified links.
"""
candidates = get_candidate_links(
file_path=os.path.join(input_folder, input_file),
max_quantity=max_quantity)
codes = candidates.code.unique()
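    # accumulate verified links in a frame with the same columns as the
    # candidates, plus a timestamp recording when each link was checked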
new_links = pd.DataFrame(columns=candidates.columns)
new_links['last_checked'] = pd.Series(dtype='datetime64[ns]')
    def in_chunks(seq: Sequence, size: int) -> Sequence:
        """Splits a Sequence into successive chunks of a given size.

        Args:
            seq (Sequence): The Sequence to split into chunks.
            size (int): The chunk size.

        Returns:
            Sequence: A generator yielding the successive chunks, each
                one being a slice of the original Sequence.
        """
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))
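
    # e.g. in_chunks([1, 2, 3, 4, 5], 2) yields [1, 2], [3, 4] and [5]

    # dispatch one chunk of municipality codes at a time to the worker pool,
    # so that at most max_simultaneous cities are verified in parallel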
pool = multiprocessing.Pool(processes=max_simultaneous)
with tqdm(total=len(codes)) as progress_bar:
        logging.info('Crawling candidate URLs for %d cities...', len(codes))
for chunk in in_chunks(codes, max_simultaneous):
results = pool.map(partial(verify_city_links, candidates), chunk)
for result in results:
for verified_link in result:
new_links.loc[len(new_links)] = verified_link
            progress_bar.update(len(chunk))
# read resource to be updated
table = get_output_to_be_merged(data_package_path)
# prepare column names
new_links.rename(columns={
'uf': 'state_code',
'code': 'municipality_code',
'name': 'municipality',
'link_type': 'branch',
'link': 'url',
'last_checked': 'last-verified-auto'
}, inplace=True)
# map values
new_links.branch = new_links.branch.str.replace('prefeitura', 'executive')
new_links.branch = new_links.branch.str.replace('camara', 'legislative')
new_links['sphere'] = 'municipal'
logging.info('Updating values...')
for _, result in new_links.iterrows():
# get existing data in file to be updated
existing_data = table.loc[
(table.municipality_code == result['municipality_code']) &
(table.branch == result['branch'])
]
if len(existing_data) > 0:
index = existing_data.index[0]
row = existing_data.loc[[index,]]
changed_columns = ['sphere', 'branch', 'url', 'last-verified-auto']
for key in changed_columns:
row.loc[index, key] = result[key] # update the values
print('\n Old: ', table.loc[index, changed_columns])
print('\n New: ', row.loc[index, changed_columns])
table.loc[index, changed_columns] = row.loc[index, changed_columns]
else:
print(f'\n Adding at position {len(table)}: ', result)
index = len(table)
for key in result.keys():
table.loc[index, key] = result[key]
# enforce correct data types
table['municipality_code'] = table['municipality_code'].astype('Int64')
table['last-verified-auto'] = pd.to_datetime(
table['last-verified-auto'], utc=True)
# remove duplicate entries,
# take into account only url column,
# keep last entry to preserve the last-verified-auto timestamp
table.drop_duplicates(subset='url', keep='last', inplace=True)
table.sort_values(
by=['sphere', 'state_code', 'municipality', 'branch'],
inplace=True)
# returns the results
return table
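
# A minimal sketch of calling auto_verify() directly from Python instead of
# via the command line (paths and limits below are illustrative, reusing the
# module defaults):
#
#     table = auto_verify(
#         input_folder=INPUT_FOLDER,
#         input_file=INPUT_FILE,
#         data_package_path=os.path.join(OUTPUT_FOLDER, 'datapackage.json'),
#         max_quantity=10,
#         max_simultaneous=4,
#     )
#     store_csv(table, os.path.join(OUTPUT_FOLDER, 'datapackage.json'))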


if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
options = parse_cli()
table = auto_verify(**options)
store_csv(table, options['data_package_path'])