|
74 | 74 | # url for Missing Migrants data |
75 | 75 | # SOURCE_URL = "https://missingmigrants.iom.int/global-figures/{year}/xls" |
76 | 76 | # SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/{year}-{month}/{file_name}' |
77 | | -SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/{file_name}' |
| 77 | +SOURCE_URL = 'https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/report-migrant-incident/Missing_Migrants_Global_Figures_allData.xlsx' |
78 | 78 |
|
79 | 79 | # format of dates in source csv file |
80 | 80 | INPUT_DATE_FORMAT = '%Y-%m-%d' |
@@ -199,33 +199,38 @@ def processData(src_url, existing_ids): |
199 | 199 | year = datetime.datetime.today().year |
200 | 200 | month = f"{datetime.datetime.today().month:02d}" |
201 | 201 |
|
202 | | - # get excel file name |
203 | | - # Send a GET request to the web page |
204 | | - url = "https://missingmigrants.iom.int/downloads" |
205 | | - response = requests.get(url) |
206 | | - # Parse the HTML content |
207 | | - soup = BeautifulSoup(response.text, "html.parser") |
208 | | - # Find the link(s) with .xlsx extension |
209 | | - xlsx_links = soup.find_all("a", href=lambda href: href and href.endswith(".xlsx")) |
210 | | - # Extract the first file name from the link(s) |
211 | | - # The web page has two links with .xlsx extension, and their file names are the same |
212 | | - file_name = [link["href"].split("/")[-1] for link in xlsx_links][0] |
| 202 | + # # get excel file name |
| 203 | + # # Send a GET request to the web page |
| 204 | + # url = "https://missingmigrants.iom.int/downloads" |
| 205 | + # response = requests.get(url) |
| 206 | + # # Parse the HTML content |
| 207 | + # soup = BeautifulSoup(response.text, "html.parser") |
| 208 | + # # Find the link(s) with .xlsx extension |
| 209 | + # xlsx_links = soup.find_all("a", href=lambda href: href and href.endswith(".xlsx")) |
| 210 | + # # Extract the first file name from the link(s) |
| 211 | + # # The web page has two links with .xlsx extension, and their file names are the same |
| 212 | + # file_name = [link["href"].split("/")[-1] for link in xlsx_links][0] |
213 | 213 |
|
214 | 214 | # create an empty list to store unique ids of new data we will be sending to Carto table |
215 | 215 | new_ids = [] |
216 | 216 |
|
217 | 217 | # Retrieve and process new data; continue until the current year is |
218 | 218 | # older than the oldest year allowed in the table, set by the MAX_AGE variable |
219 | 219 | # while year > MAX_AGE.year: |
220 | | - logging.info("Fetching data from {}".format(src_url.format(year=year, month=month, file_name=file_name))) |
| 220 | + logging.info(f"Fetching data from {src_url}") |
221 | 221 | # generate the url and pull data for the selected year |
222 | 222 | try: |
223 | | - urllib.request.urlretrieve(src_url.format(year=year, month=month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx')) |
| 223 | + # urllib.request.urlretrieve(src_url.format(year=year, month=month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx')) |
| 224 | + req = urllib.request.Request(src_url, headers={'User-Agent': 'Mozilla/5.0'}) |
| 225 | + with urllib.request.urlopen(req) as response: |
| 226 | + data = response.read() |
| 227 | + with open(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'), "wb") as f: |
| 228 | + f.write(data) |
224 | 229 | except: |
225 | 230 | # try to pull last month's data |
226 | 231 | year = (datetime.datetime.today().replace(day=1) - datetime.timedelta(days=1)).year |
227 | 232 | month = f"{(datetime.datetime.today().replace(day=1) - datetime.timedelta(days=1)).month:02d}" |
228 | | - urllib.request.urlretrieve(src_url.format(year = year, month = month, file_name=file_name), os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx')) |
| 233 | +            urllib.request.urlretrieve(src_url, os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'))
229 | 234 | # convert excel file to csv |
230 | 235 | read_file = pd.read_excel(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.xlsx'), sheet_name='Worksheet', engine = 'openpyxl') |
231 | 236 | read_file.to_csv(os.path.join(DATA_DIR, f'MissingMigrants-Global-{year}-{month}.csv'), index = None, header=True) |
|
0 commit comments