From a6883e8446769cce6e5447935e70ca4186e01356 Mon Sep 17 00:00:00 2001 From: sharinetmc Date: Tue, 19 Dec 2023 06:28:01 -1000 Subject: [PATCH 1/8] attempt_gsheet_method add function --- parsons/google/google_sheets.py | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/parsons/google/google_sheets.py b/parsons/google/google_sheets.py index 5890763a01..336e5601cb 100644 --- a/parsons/google/google_sheets.py +++ b/parsons/google/google_sheets.py @@ -7,6 +7,9 @@ import gspread from google.oauth2.service_account import Credentials +from gspread.exceptions import APIError +from requests.exceptions import HTTPError, ReadTimeout +import time logger = logging.getLogger(__name__) @@ -395,6 +398,68 @@ def format_cells(self, spreadsheet_id, range, cell_format, worksheet=0): ws.format(range, cell_format) logger.info("Formatted worksheet") + def attempt_gsheet_method(self, method, i=1, max=6, wait_time=15, **kwargs): + """ + The Google Sheets API has notoriously strict rate limits (e.g. 60 calls per minute). This + function calls itself (i.e. is recursive) to help configure wait times and retry attempts + needed to wait out rate limit errors instead of letting them derail a script. 
+ `Args:` + method: str + The name of the Parsons GoogleSheets method to be attempted + i: int + Where to start the retry count - defaults to 1; mostly needed for recursive calls + max: int + How many attempts to make before giving up - defaults to 6 + wait_time: int + Number of seconds to wait between attempts - defaults to 15 + kwargs: dict + Any arguments required by `method` - note that positional args will have to be named + `Returns:` + Whatever `method` is supposed to return + + """ + + # Recursively account for nested methods as needed + nested_methods = method.split(".") + + if len(nested_methods) == 1: + final_method = self + else: + final_method = self[nested_methods[0]] + nested_methods.pop(0) + + try: + + # If final_method isn't callable, then the API call is made in the loop, not below + for m in nested_methods: + final_method = getattr(final_method, m) + + # Using getattr allows the method/attribute to be user-provided + if callable(final_method): + output = final_method(**kwargs) + else: + output = final_method + + except (APIError, HTTPError, ReadTimeout, ConnectionError) as e: + # Lets get the ordinals right, because why not + if i % 10 == 1: + ordinal = "st" + elif i % 10 == 2: + ordinal = "nd" + else: + ordinal = "th" + + logger.info(f"trying to {method} for the {i}{ordinal} time") + if i < max: + time.sleep(wait_time) + i += 1 + output = self.attempt_gsheet_method(method, i, max, wait_time, **kwargs) + + else: + raise e + + return output + def read_sheet(self, spreadsheet_id, sheet_index=0): # Deprecated method v0.14 of Parsons. 
From 07d4da76c1b233e82e0e6b78fb7d171a7cbb3a79 Mon Sep 17 00:00:00 2001 From: sharinetmc Date: Tue, 19 Dec 2023 08:57:58 -1000 Subject: [PATCH 2/8] add unit test --- parsons/google/google_sheets.py | 81 ++++++++++++++++++++++++++ test/test_google/test_google_sheets.py | 27 +++++++++ 2 files changed, 108 insertions(+) diff --git a/parsons/google/google_sheets.py b/parsons/google/google_sheets.py index 336e5601cb..9d1c319183 100644 --- a/parsons/google/google_sheets.py +++ b/parsons/google/google_sheets.py @@ -10,6 +10,7 @@ from gspread.exceptions import APIError from requests.exceptions import HTTPError, ReadTimeout import time +import utilities logger = logging.getLogger(__name__) @@ -460,6 +461,86 @@ def attempt_gsheet_method(self, method, i=1, max=6, wait_time=15, **kwargs): return output + def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): + """ + Combines data from multiple Google Sheets into a Parsons Table. + `Args:` + sheet_ids: str, list + The IDs of the Google Spreadsheets with that data to be combined. Can be a + comma-separated string of IDs or a list. + + worksheet_id: str + If None, the first worksheet (ID = 0) is assumed. + + `Returns:` obj + Parsons Table containing the concatenated data from all the sheets. + """ + id_col = "sheet_id" + sheet_id_list = [] + + # Parse different possible sheet_ids types + if isinstance(sheet_ids, list): + # Already a list! 
+ sheet_id_list = sheet_ids + + elif "," in sheet_ids: + # Comma-separated string + sheet_id_list = [x.strip() for x in sheet_ids.split(",")] + + else: + raise ValueError(f"{sheet_ids} is not a valid string or list GSheet IDs") + + # Non-DB table options yield a list, convert to Parsons table with default worksheet col + if sheet_id_list: + sheet_id_tbl = Table( + [{"sheet_id": x, "worksheet_id": 0} for x in sheet_id_list] + ) + + if not worksheet_id: + worksheet_id = "worksheet_id" + + # Empty table to accumulate data from spreadsheets + combined = Table() + + # Set for path to temp file to keep storage/memory in check for large lists + temp_files = [] + + logger.info( + f"Found {sheet_id_tbl.num_rows} Spreadsheets. Looping to get data from each one." + ) + + for sheet_id in sheet_id_tbl: + + # Keep a lid on how many temp files result from materializing below + if len(temp_files) > 1: + utilities.files.close_temp_file(temp_files[0]) + temp_files.remove(temp_files[0]) + + # Grab the sheet's data + data = self.attempt_gsheet_method( + "get_worksheet", + max=10, + wait_time=60, + spreadsheet_id=sheet_id[id_col], + worksheet=sheet_id[worksheet_id], + ) + + # Add the sheet ID as a column + data.add_column("spreadsheet_id", sheet_id[id_col]) + + # Retrieve sheet title (with attempts to handle rate limits) and add as a column + globals()["sheet_obj"] = self.attempt_gsheet_method( + "gs.gspread_client.open_by_key", key=sheet_id[id_col] + ) + sheet_title = str(self.attempt_gsheet_method("sheet_obj.title")) + data.add_column("spreadsheet_title", sheet_title) + + # Accumulate and materialize + combined.concat(data) + temp_files.append(combined.materialize_to_file()) + + return combined + def read_sheet(self, spreadsheet_id, sheet_index=0): # Deprecated method v0.14 of Parsons. 
diff --git a/test/test_google/test_google_sheets.py b/test/test_google/test_google_sheets.py index 7df50ddf94..47ac4b9be8 100644 --- a/test/test_google/test_google_sheets.py +++ b/test/test_google/test_google_sheets.py @@ -164,3 +164,30 @@ def test_share_spreadsheet(self): self.spreadsheet_id ) self.assertIn("bob@bob.com", permissions["emailAddress"]) + + def test_combine_multiple_sheet_data_attempt_gsheet_method(self): + spreadsheet_id_a = self.google_sheets.create_spreadsheet("parsons_test_01") + test_table_a = Table( + [ + {"first": "Bob", "last": "Smith"}, + {"first": "Sue", "last": "Doe"}, + ] + ) + self.google_sheets.overwrite_sheet(spreadsheet_id_a, test_table_a) + + spreadsheet_id_b = self.google_sheets.create_spreadsheet("parsons_test_02") + test_table_b = Table( + [ + {"first": "Ted", "last": "Smith"}, + {"first": "Susan", "last": "Kerry"}, + ] + ) + self.google_sheets.overwrite_sheet(spreadsheet_id_b, test_table_b) + + combined_data = self.google_sheets.combine_multiple_sheet_data( + [spreadsheet_id_a, spreadsheet_id_b] + ) + + self.assertEqual( + combined_data.num_rows, test_table_a.num_rows + test_table_b.num_rows + ) From f629af9764a3f786a4abbda629b36d77bba79096 Mon Sep 17 00:00:00 2001 From: sharinetmc Date: Mon, 20 May 2024 03:48:35 -1000 Subject: [PATCH 3/8] some pr fix requests --- parsons/google/google_sheets.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/parsons/google/google_sheets.py b/parsons/google/google_sheets.py index 9d1c319183..a18807553b 100644 --- a/parsons/google/google_sheets.py +++ b/parsons/google/google_sheets.py @@ -450,11 +450,11 @@ def attempt_gsheet_method(self, method, i=1, max=6, wait_time=15, **kwargs): else: ordinal = "th" - logger.info(f"trying to {method} for the {i}{ordinal} time") + logger.debug(f"trying to {method} for the {i}{ordinal} time") if i < max: time.sleep(wait_time) i += 1 - output = self.attempt_gsheet_method(method, i, max, wait_time, **kwargs) + return 
self.attempt_gsheet_method(method, i, max, wait_time, **kwargs) else: raise e @@ -469,7 +469,7 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): The IDs of the Google Spreadsheets with that data to be combined. Can be a comma-separated string of IDs or a list. - worksheet_id: str + worksheet_id: str (optional) If None, the first worksheet (ID = 0) is assumed. `Returns:` obj @@ -493,7 +493,7 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): # Non-DB table options yield a list, convert to Parsons table with default worksheet col if sheet_id_list: sheet_id_tbl = Table( - [{"sheet_id": x, "worksheet_id": 0} for x in sheet_id_list] + [{id_col: x, "worksheet_id": 0} for x in sheet_id_list] ) if not worksheet_id: @@ -529,10 +529,11 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): data.add_column("spreadsheet_id", sheet_id[id_col]) # Retrieve sheet title (with attempts to handle rate limits) and add as a column - globals()["sheet_obj"] = self.attempt_gsheet_method( + self.__sheet_obj = self.attempt_gsheet_method( "gs.gspread_client.open_by_key", key=sheet_id[id_col] ) sheet_title = str(self.attempt_gsheet_method("sheet_obj.title")) + del self.__sheet_obj data.add_column("spreadsheet_title", sheet_title) # Accumulate and materialize From be4caba1bb3f9e767b9d19081585e81e75597c1a Mon Sep 17 00:00:00 2001 From: Shauna Date: Thu, 9 May 2024 15:16:58 -0400 Subject: [PATCH 4/8] Update mac runner to macos-12 instead of mac-latest (#1049) Hopefully this will fix the breaking tests that are stalling PR merges --- .github/workflows/tests-mac.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests-mac.yml b/.github/workflows/tests-mac.yml index 7d5595eda1..aad1a1d9b7 100644 --- a/.github/workflows/tests-mac.yml +++ b/.github/workflows/tests-mac.yml @@ -17,7 +17,7 @@ jobs: python-version: ['3.8', '3.11'] limited-dependencies: ['','TRUE'] - runs-on: macos-latest + runs-on: macos-12 
steps: From d49e69b3881f31e9fb3ef838f4da16be8d606c2e Mon Sep 17 00:00:00 2001 From: sharinetmc Date: Mon, 20 May 2024 04:18:52 -1000 Subject: [PATCH 5/8] resolve a comment --- parsons/google/google_sheets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parsons/google/google_sheets.py b/parsons/google/google_sheets.py index df7dbe9a3e..961aaf932b 100644 --- a/parsons/google/google_sheets.py +++ b/parsons/google/google_sheets.py @@ -541,9 +541,7 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): # Non-DB table options yield a list, convert to Parsons table with default worksheet col if sheet_id_list: - sheet_id_tbl = Table( - [{id_col: x, "worksheet_id": 0} for x in sheet_id_list] - ) + sheet_id_tbl = [{"sheet_id": x, "worksheet_id": 0} for x in sheet_id_list] if not worksheet_id: worksheet_id = "worksheet_id" From 54e17ebce1c805b4c645afd24b3f4f290bb402d5 Mon Sep 17 00:00:00 2001 From: sharinetmc Date: Mon, 20 May 2024 04:24:23 -1000 Subject: [PATCH 6/8] test fialures source --- parsons/google/google_sheets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parsons/google/google_sheets.py b/parsons/google/google_sheets.py index 961aaf932b..0794f4f0fc 100644 --- a/parsons/google/google_sheets.py +++ b/parsons/google/google_sheets.py @@ -576,9 +576,7 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): data.add_column("spreadsheet_id", sheet_id[id_col]) # Retrieve sheet title (with attempts to handle rate limits) and add as a column - self.__sheet_obj = self.attempt_gsheet_method( - "gs.gspread_client.open_by_key", key=sheet_id[id_col] - ) + self.__sheet_obj = self.gspread_client.open_by_key(sheet_id[id_col]) sheet_title = str(self.attempt_gsheet_method("sheet_obj.title")) del self.__sheet_obj data.add_column("spreadsheet_title", sheet_title) From cf004714099b7760544fc0d7712849a2f594ab2e Mon Sep 17 00:00:00 2001 From: sharinetmc Date: Mon, 20 May 2024 05:13:09 -1000 Subject: [PATCH 
7/8] more pr comment fixes --- parsons/google/google_sheets.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parsons/google/google_sheets.py b/parsons/google/google_sheets.py index 0794f4f0fc..b4b4db8a6e 100644 --- a/parsons/google/google_sheets.py +++ b/parsons/google/google_sheets.py @@ -513,6 +513,11 @@ def attempt_gsheet_method(self, method, i=1, max=6, wait_time=15, **kwargs): def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): """ Combines data from multiple Google Sheets into a Parsons Table. + The spreadsheets will be treated as if they are concatenated, meaning columns would + need to align positionally with matching data types. + This function also adds spreadsheet_id and spreadsheet_title + columns to the resulting table. + `Args:` sheet_ids: str, list The IDs of the Google Spreadsheets with that data to be combined. Can be a comma-separated string of IDs or a list. @@ -571,7 +576,6 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): spreadsheet_id=sheet_id[id_col], worksheet=sheet_id[worksheet_id], ) - # Add the sheet ID as a column data.add_column("spreadsheet_id", sheet_id[id_col]) @@ -584,7 +588,6 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): # Accumulate and materialize combined.concat(data) temp_files.append(combined.materialize_to_file()) - return combined def read_sheet(self, spreadsheet_id, sheet_index=0): From 8b47709f5289f0660d581e829b6cd9bc91115edb Mon Sep 17 00:00:00 2001 From: sharinetmc Date: Tue, 28 May 2024 00:29:59 -1000 Subject: [PATCH 8/8] recursion pr comments --- parsons/google/google_sheets.py | 74 ++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/parsons/google/google_sheets.py b/parsons/google/google_sheets.py index b4b4db8a6e..b152ac9e8d 100644 --- a/parsons/google/google_sheets.py +++ b/parsons/google/google_sheets.py @@ -448,7 +448,7 @@ def format_cells(self, spreadsheet_id, range, cell_format, worksheet=0): ws.format(range, 
cell_format) logger.info("Formatted worksheet") - def attempt_gsheet_method(self, method, i=1, max=6, wait_time=15, **kwargs): + def attempt_gsheet_method(self, method, max=6, wait_time=15, **kwargs): """ The Google Sheets API has notoriously strict rate limits (e.g. 60 calls per minute). This function calls itself (i.e. is recursive) to help configure wait times and retry attempts @@ -469,46 +469,47 @@ def attempt_gsheet_method(self, method, i=1, max=6, wait_time=15, **kwargs): """ - # Recursively account for nested methods as needed - nested_methods = method.split(".") + def inner_attempt_gsheet_method(method, i, max=6, wait_time=15, **kwargs): + # Recursively account for nested methods as needed + nested_methods = method.split(".") - if len(nested_methods) == 1: - final_method = self - else: - final_method = self[nested_methods[0]] - nested_methods.pop(0) + if len(nested_methods) == 1: + final_method = self + else: + final_method = self[nested_methods[0]] + nested_methods.pop(0) - try: + try: - # If final_method isn't callable, then the API call is made in the loop, not below - for m in nested_methods: - final_method = getattr(final_method, m) + # If final_method isn't callable, then the API call is made in the loop, not below + for m in nested_methods: + final_method = getattr(final_method, m) - # Using getattr allows the method/attribute to be user-provided - if callable(final_method): - output = final_method(**kwargs) - else: - output = final_method - - except (APIError, HTTPError, ReadTimeout, ConnectionError) as e: - # Lets get the ordinals right, because why not - if i % 10 == 1: - ordinal = "st" - elif i % 10 == 2: - ordinal = "nd" - else: - ordinal = "th" + # Using getattr allows the method/attribute to be user-provided + if callable(final_method): + output = final_method(**kwargs) + else: + output = final_method - logger.debug(f"trying to {method} for the {i}{ordinal} time") - if i < max: - time.sleep(wait_time) - i += 1 - return 
self.attempt_gsheet_method(method, i, max, wait_time, **kwargs) + + else: + raise e - return output + output = inner_attempt_gsheet_method(method, 0, max, wait_time, **kwargs) + return output def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): """ @@ -588,6 +589,11 @@ def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None): # Accumulate and materialize combined.concat(data) temp_files.append(combined.materialize_to_file()) + + if len(temp_files) > 1: + utilities.files.close_temp_file(temp_files[0]) + temp_files.remove(temp_files[0]) + return combined def read_sheet(self, spreadsheet_id, sheet_index=0):