|
74 | 74 | # url for Missing Migrants data |
75 | 75 | # SOURCE_URL = "https://missingmigrants.iom.int/global-figures/{year}/xls" |
76 | 76 | # SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/{year}-{month}/{file_name}' |
77 | | -SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/{file_name}' |
| 77 | +SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/Missing_Migrants_Global_Figures_allData.xlsx' |
78 | 78 |
|
79 | 79 | # format of dates in source csv file |
80 | 80 | INPUT_DATE_FORMAT = '%Y-%m-%d' |
@@ -199,33 +199,38 @@ def processData(src_url, existing_ids): |
199 | 199 | year = datetime.datetime.today().year |
200 | 200 | month = f"{datetime.datetime.today().month:02d}" |
201 | 201 |
|
202 | | - # get excel file name |
203 | | - # Send a GET request to the web page |
204 | | - url = "https://missingmigrants.iom.int/downloads" |
205 | | - response = requests.get(url) |
206 | | - # Parse the HTML content |
207 | | - soup = BeautifulSoup(response.text, "html.parser") |
208 | | - # Find the link(s) with .xlsx extension |
209 | | - xlsx_links = soup.find_all("a", href=lambda href: href and href.endswith(".xlsx")) |
210 | | - # Extract the first file name from the link(s) |
211 | | - # The web page has two links with .xlsx extension, and their file names are the same |
212 | | - file_name = [link["href"].split("/")[-1] for link in xlsx_links][0] |
| 202 | + # # get excel file name |
| 203 | + # # Send a GET request to the web page |
| 204 | + # url = "https://missingmigrants.iom.int/downloads" |
| 205 | + # response = requests.get(url) |
| 206 | + # # Parse the HTML content |
| 207 | + # soup = BeautifulSoup(response.text, "html.parser") |
| 208 | + # # Find the link(s) with .xlsx extension |
| 209 | + # xlsx_links = soup.find_all("a", href=lambda href: href and href.endswith(".xlsx")) |
| 210 | + # # Extract the first file name from the link(s) |
| 211 | + # # The web page has two links with .xlsx extension, and their file names are the same |
| 212 | + # file_name = [link["href"].split("/")[-1] for link in xlsx_links][0] |
213 | 213 |
|
214 | 214 | # create an empty list to store unique ids of new data we will be sending to Carto table |
215 | 215 | new_ids = [] |
216 | 216 |
|
217 | 217 | # Retrieve and process new data; continue until the current year is |
218 | 218 | # older than the oldest year allowed in the table, set by the MAX_AGE variable |
219 | 219 | # while year > MAX_AGE.year: |
220 | | - logging.info("Fetching data from {}".format(src_url.format(year=year, month=month, file_name=file_name))) |
| 220 | + logging.info(f"Fetching data from {src_url}") |
221 | 221 | # generate the url and pull data for the selected year |
222 | 222 | try: |
223 | | - urllib.request.urlretrieve(src_url.format(year=year, month=month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx')) |
| 223 | + # urllib.request.urlretrieve(src_url.format(year=year, month=month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx')) |
| 224 | + req = urllib.request.Request(src_url, headers={'User-Agent': 'Mozilla/5.0'}) |
| 225 | + with urllib.request.urlopen(req) as response: |
| 226 | + data = response.read() |
| 227 | + with open(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'), "wb") as f: |
| 228 | + f.write(data) |
224 | 229 | except: |
225 | 230 | # try to pull last month's data |
226 | 231 | year = (datetime.datetime.today().replace(day=1) - datetime.timedelta(days=1)).year |
227 | 232 | month = f"{(datetime.datetime.today().replace(day=1) - datetime.timedelta(days=1)).month:02d}" |
228 | | - urllib.request.urlretrieve(src_url.format(year = year, month = month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx')) |
| 233 | +            urllib.request.urlretrieve(src_url, os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'))
229 | 234 | # convert excel file to csv |
230 | 235 | read_file = pd.read_excel(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'), sheet_name='Worksheet', engine = 'openpyxl') |
231 | 236 | read_file.to_csv(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.csv'), index = None, header=True) |
|
0 commit comments