Skip to content

Commit 6c20507

Browse files
committed
update request methods
1 parent 0684a71 commit 6c20507

File tree

2 files changed

+28
-17
lines changed

2 files changed

+28
-17
lines changed

cli_044_global_land_temperature/contents/src/__init__.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def fetchDataFileName(url):
169169

170170
# extract all the <a> tags within the html content. The <a> tags are used to mark links, so
171171
# we will be able to find the files available for download marked with these tags.
172-
links = soup.findAll('a')
172+
links = soup.find_all('a')
173173
# There are some anchors (<a> tags) without href attribute
174174
# first, filter the links down to those that have an href attribute
175175
# https://stackoverflow.com/questions/52398738/python-sort-to-avoid-keyerror-href?noredirect=1&lq=1
@@ -259,7 +259,13 @@ def processData(url, existing_ids, date_format='%Y-%m-%d %H:%M:%S'):
259259
# Get the link from source url for which we want to download data
260260
resource_location = fetchDataFileName(url)
261261
# get the data from source as a list of strings, with each string holding one line from the source data file
262-
res_rows = tryRetrieveData(url, resource_location)
262+
headers = {'User-Agent': 'Mozilla/5.0'}
263+
response = requests.get(resource_location, headers=headers)
264+
if response.ok:
265+
text_data = response.text
266+
# Splitting the data into rows based on newlines
267+
res_rows = text_data.splitlines()
268+
263269
# create an empty dictionary to store new data (data that's not already in our Carto table)
264270
new_data = {}
265271
# remove the headers by deleting the first five rows

soc_018_migrant_deaths/contents/src/__init__.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474
# url for Missing Migrants data
7575
# SOURCE_URL = "https://missingmigrants.iom.int/global-figures/{year}/xls"
7676
# SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/{year}-{month}/{file_name}'
77-
SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/{file_name}'
77+
SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/Missing_Migrants_Global_Figures_allData.xlsx'
7878

7979
# format of dates in source csv file
8080
INPUT_DATE_FORMAT = '%Y-%m-%d'
@@ -199,33 +199,38 @@ def processData(src_url, existing_ids):
199199
year = datetime.datetime.today().year
200200
month = f"{datetime.datetime.today().month:02d}"
201201

202-
# get excel file name
203-
# Send a GET request to the web page
204-
url = "https://missingmigrants.iom.int/downloads"
205-
response = requests.get(url)
206-
# Parse the HTML content
207-
soup = BeautifulSoup(response.text, "html.parser")
208-
# Find the link(s) with .xlsx extension
209-
xlsx_links = soup.find_all("a", href=lambda href: href and href.endswith(".xlsx"))
210-
# Extract the first file name from the link(s)
211-
# The web page has two links with .xlsx extension, and their file names are the same
212-
file_name = [link["href"].split("/")[-1] for link in xlsx_links][0]
202+
# # get excel file name
203+
# # Send a GET request to the web page
204+
# url = "https://missingmigrants.iom.int/downloads"
205+
# response = requests.get(url)
206+
# # Parse the HTML content
207+
# soup = BeautifulSoup(response.text, "html.parser")
208+
# # Find the link(s) with .xlsx extension
209+
# xlsx_links = soup.find_all("a", href=lambda href: href and href.endswith(".xlsx"))
210+
# # Extract the first file name from the link(s)
211+
# # The web page has two links with .xlsx extension, and their file names are the same
212+
# file_name = [link["href"].split("/")[-1] for link in xlsx_links][0]
213213

214214
# create an empty list to store the unique ids of the new data we will be sending to the Carto table
215215
new_ids = []
216216

217217
# Retrieve and process new data; continue until the current year is
218218
# older than the oldest year allowed in the table, set by the MAX_AGE variable
219219
# while year > MAX_AGE.year:
220-
logging.info("Fetching data from {}".format(src_url.format(year=year, month=month, file_name=file_name)))
220+
logging.info(f"Fetching data from {src_url}")
221221
# generate the url and pull data for the selected year
222222
try:
223-
urllib.request.urlretrieve(src_url.format(year=year, month=month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'))
223+
# urllib.request.urlretrieve(src_url.format(year=year, month=month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'))
224+
req = urllib.request.Request(src_url, headers={'User-Agent': 'Mozilla/5.0'})
225+
with urllib.request.urlopen(req) as response:
226+
data = response.read()
227+
with open(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'), "wb") as f:
228+
f.write(data)
224229
except:
225230
# try to pull last month's data
226231
year = (datetime.datetime.today().replace(day=1) - datetime.timedelta(days=1)).year
227232
month = f"{(datetime.datetime.today().replace(day=1) - datetime.timedelta(days=1)).month:02d}"
228-
urllib.request.urlretrieve(src_url.format(year = year, month = month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'))
233+
urllib.request.urlretrieve(src_url.format(year = year, month = month), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'))
229234
# convert excel file to csv
230235
read_file = pd.read_excel(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'), sheet_name='Worksheet', engine = 'openpyxl')
231236
read_file.to_csv(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.csv'), index = None, header=True)

0 commit comments

Comments
 (0)