From 9a6de94ff465fb21c11afe8894e9c6b884f0d3f6 Mon Sep 17 00:00:00 2001 From: crusopaul Date: Sun, 22 Feb 2026 11:29:23 -0500 Subject: [PATCH 1/8] Update etl.py --- parsons/etl/etl.py | 280 +++++++++++++++++++++++++-------------------- 1 file changed, 158 insertions(+), 122 deletions(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index e63bf0cb55..4ab58db1a7 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -1,5 +1,5 @@ import logging -from typing import Literal +from typing import Any, Literal, Self import petl @@ -8,55 +8,62 @@ class ETL: def __init__(self): - pass + self.table = petl.fromdicts([]) - def head(self, n=5): + def head(self, n: int = 5) -> Self: """ - Return the first n rows of the table + Select the first n rows of the table, dropping other records. `Args:` n: int - The number of rows to return. Defaults to 5. + The number of rows to select. Defaults to 5. `Returns:` - `Parsons Table` + ETL: The modified ETL (self). + """ self.table = petl.head(self.table, n) - return self - def tail(self, n=5): + def tail(self, n: int = 5): """ - Return the last n rows of the table. Defaults to 5. + Select the last n rows of the table, dropping other records. `Args:` n: int - The number of rows to return - + The number of rows to select. Defaults to 5. `Returns:` - `Parsons Table` + ETL: The modified ETL (self). + """ self.table = petl.tail(self.table, n) - return self - def add_column(self, column, value=None, index=None, if_exists="fail"): + def add_column( + self, + column: str, + value: Any = None, + index: int = None, + if_exists: Literal["fail", "replace"] = "fail", + ) -> Self: """ - Add a column to your table + Add a column to your table. `Args:` column: str - Name of column to add - value: - A fixed or calculated value + Name of column to add. + value: Any + A fixed or calculated value. index: int - The position of the new column in the table + Optionally, the position of the new column in the table. Default behavior + inserts the record at the end of the table. if_exists: str (options: 'fail', 'replace') If set `replace`, this function will call `fill_column` - if the column already exists, rather than raising a `ValueError` + if the column already exists, rather than raising a `ValueError`. `Returns:` - `Parsons Table` and also updates self + ETL: The modified ETL (self). + """ if column in self.columns: @@ -67,47 +74,46 @@ def add_column(self, column, value=None, index=None, if_exists="fail"): raise ValueError(f"Column {column} already exists") self.table = self.table.addfield(column, value, index) - return self - def remove_column(self, *columns): + def remove_column(self, *columns: str) -> Self: r""" - Remove a column from your table + Remove a column(s) from your table. `Args:` *columns: str - Column names + Column name(s). `Returns:` - `Parsons Table` and also updates self + ETL: The modified ETL (self). + """ self.table = petl.cutout(self.table, *columns) - return self - def rename_column(self, column_name, new_column_name): + def rename_column(self, column_name: str, new_column_name: str) -> Self: """ - Rename a column + Rename a column. `Args:` column_name: str - The current column name + The current column name. new_column_name: str - The new column name + The new column name. `Returns:` - `Parsons Table` and also updates self + ETL: The modified ETL (self). + """ if new_column_name in self.columns: raise ValueError(f"Column {new_column_name} already exists") self.table = petl.rename(self.table, column_name, new_column_name) - return self - def rename_columns(self, column_map): + def rename_columns(self, column_map: dict) -> Self: """ - Rename multiple columns + Rename multiple columns. `Args:` column_map: dict @@ -119,7 +125,8 @@ def rename_columns(self, column_map): 'old_name2': 'new_name2'} `Returns:` - `Parsons Table` and also updates self + ETL: The modified ETL (self). + """ # Check if old column name exists and new column name does not exist @@ -131,20 +138,20 @@ def rename_columns(self, column_map): # Uses the underlying petl method self.table = petl.rename(self.table, column_map) - return self - def fill_column(self, column_name, fill_value): + def fill_column(self, column_name: str, fill_value: Any = None) -> Self: """ - Fill a column in a table + Fill a column in a table. `Args:` column_name: str - The column to fill - fill_value: - A fixed or calculated value + The column to fill. + fill_value: Any + A fixed or calculated value. `Returns:` - `Parsons Table` and also updates self + ETL: The modified ETL (self). + """ if callable(fill_value): @@ -158,15 +165,16 @@ def fill_column(self, column_name, fill_value): def fillna_column(self, column_name, fill_value): """ - Fill None values in a column in a table + Fill None values in a column in a table. `Args:` column_name: str - The column to fill + The column to fill. fill_value: - A fixed or calculated value + A fixed or calculated value. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ if callable(fill_value): @@ -189,15 +197,16 @@ def fillna_column(self, column_name, fill_value): def move_column(self, column, index): """ - Move a column + Move a column. `Args:` column: str - The column name to move + The column name to move. index: - The new index for the column + The new index for the column. `Returns:` `Parsons Table` and also updates existing object. + """ self.table = petl.movefield(self.table, column, index) @@ -212,11 +221,12 @@ def convert_column(self, *column, **kwargs): `Args:` *column: str - A single column or multiple columns passed as a list + A single column or multiple columns passed as a list. **kwargs: str, method or variable - The update function, method, or variable to process the update + The update function, method, or variable to process the update. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ self.table = petl.convert(self.table, *column, **kwargs) @@ -231,7 +241,8 @@ def get_column_max_width(self, column): column: str The column name. `Returns:` - int + int. + """ max_width = 0 @@ -245,10 +256,11 @@ def get_column_max_width(self, column): def convert_columns_to_str(self): """ Convenience function to convert all non-string or mixed columns in a - Parsons table to string (e.g. for comparison) + Parsons table to string (e.g. for comparison.) `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ # If we don't have any rows, don't bother trying to convert things @@ -277,14 +289,15 @@ def coalesce_columns(self, dest_column, source_columns, remove_source_columns=Tr `Args:` dest_column: str - Name of destination column + Name of destination column. source_columns: list - List of source column names + List of source column names. remove_source_columns: bool Whether to remove the source columns after the coalesce. If the destination column is also one of the source columns, it will not be removed. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ if dest_column in self.columns: @@ -322,12 +335,12 @@ def map_columns(self, column_map, exact_match=True): `Args:` column_map: dict - A dictionary of columns and possible values that map to it + A dictionary of columns and possible values that map to it. exact_match: boolean If ``True`` will only map if an exact match. If ``False`` will ignore case, spaces and underscores. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. .. code-block:: python @@ -341,6 +354,7 @@ def map_columns(self, column_map, exact_match=True): tbl.map_columns(column_map) print (tbl) >> {{'first_name': 'Jane', 'last_name': 'Doe', 'date_of_birth': '1908-01-01'}} + """ for col in self.columns: @@ -361,12 +375,13 @@ def map_and_coalesce_columns(self, column_map): destination column name already exists in the table, in which case that value will be preferenced. This method is helpful when your input table might have multiple and unknown column names. + `Args:` column_map: dict - A dictionary of columns and possible values that map to it + A dictionary of columns and possible values that map to it. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. .. code-block:: python @@ -383,6 +398,7 @@ def map_and_coalesce_columns(self, column_map): print (tbl) >> {{'first_name': 'Jane', 'last_name': 'Doe', 'date_of_birth': '1908-01-01'}} + """ for key, value in column_map.items(): @@ -405,28 +421,30 @@ def map_and_coalesce_columns(self, column_map): def get_column_types(self, column): """ - Return all of the Python types for values in a given column + Return all of the Python types for values in a given column. `Args:` column: str - Name of the column to analyze + Name of the column to analyze. `Returns:` list - A list of Python types + A list of Python types. + """ return list(petl.typeset(self.table, column)) def get_columns_type_stats(self): """ - Return descriptive stats for all columns + Return descriptive stats for all columns. `Returns:` list - A list of dicts + A list of dicts. `Returns:` list - A list of dicts, each containing a column 'name' and a 'type' list + A list of dicts, each containing a column 'name' and a 'type' list. + """ return [{"name": col, "type": self.get_column_types(col)} for col in self.table.columns()] @@ -440,9 +458,10 @@ def convert_table(self, *args): `Args:` *args: str, method or variable - The update function, method, or variable to process the update. Can also + The update function, method, or variable to process the update. Can also... `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ self.convert_column(self.columns, *args) @@ -460,26 +479,29 @@ def unpack_dict( prepend_value=None, ): """ - Unpack dictionary values from one column into separate columns + Unpack dictionary values from one column into separate columns. `Args:` column: str - The column name to unpack + The column name to unpack. keys: list The dict keys in the column to unpack. If ``None`` will unpack all. include_original: boolean - Retain original column after unpacking + Retain original column after unpacking. sample_size: int - Number of rows to sample before determining columns + Number of rows to sample before determining columns. missing: str - If a value is missing, the value to fill it with + If a value is missing, the value to fill it with. prepend: Prepend the column name of the unpacked values. Useful for - avoiding duplicate column names + avoiding duplicate column names. prepend_value: Value to prepend new columns if ``prepend=True``. If None, will set to column name. + `Returns:` + `Parsons Table` and also updates self. + """ if prepend: @@ -532,19 +554,20 @@ def unpack_list( `Args:` column: str - The column name to unpack + The column name to unpack. include_original: boolean - Retain original column after unpacking + Retain original column after unpacking. sample_size: int - Number of rows to sample before determining columns + Number of rows to sample before determining columns. missing: str - If a value is missing, the value to fill it with + If a value is missing, the value to fill it with. replace: boolean - Return new table or replace existing + Return new table or replace existing. max_columns: int - The maximum number of columns to unpack + The maximum number of columns to unpack. `Returns:` - None + `Parsons Table` and also updates self. + """ # Convert all column values to list to avoid unpack errors @@ -589,17 +612,18 @@ def unpack_nested_columns_as_rows(self, column, key="id", expand_original: bool `Args:` column: str - The column name to unpack + The column name to unpack. key: str - The column to use as a key when unpacking. Defaults to `id` + The column to use as a key when unpacking. Defaults to `id`. expand_original: boolean or int - If `True`: Add resulting unpacked rows (with all other columns) to original - If `int`: Add to original unless the max added per key is above the given number - If `False` (default): Return unpacked rows (with `key` column only) as standalone + If `True`: Add resulting unpacked rows (with all other columns) to original. + If `int`: Add to original unless the max added per key is above the given number. + If `False` (default): Return unpacked rows (with `key` column only) as standalone. Removes packed list and dict rows from original either way. `Returns:` - If `expand_original`, original table with packed rows replaced by unpacked rows - Otherwise, standalone table with key column and unpacked values only + If `expand_original`, original table with packed rows replaced by unpacked rows. + Otherwise, standalone table with key column and unpacked values only. + """ if isinstance(expand_original, int) and expand_original is not True: @@ -722,24 +746,25 @@ def long_table( `Args:` key: lst - The columns to retain in the long table (e.g. foreign keys) + The columns to retain in the long table (e.g. foreign keys). column: str - The column name to make long + The column name to make long. key_rename: dict The new name for the foreign key to better identify it. For example, you might want to rename ``id`` to ``person_id``. - Ex. {'KEY_NAME': 'NEW_KEY_NAME'} + Ex. {'KEY_NAME': 'NEW_KEY_NAME'}. retain_original: boolean Retain the original column from the source table. prepend: Prepend the column name of the unpacked values. Useful for - avoiding duplicate column names + avoiding duplicate column names. prepend_value: Value to prepend new columns if ``prepend=True``. If None, will set to column name. `Returns:` Parsons Table - The new long table + The new long table. + """ if type(key) is str: @@ -768,13 +793,14 @@ def long_table( def cut(self, *columns): r""" - Return a table of selection of columns + Return a table of selection of columns. `Args:` *columns: str - Columns in the parsons table + Columns in the parsons table. `Returns:` - A new parsons table containing the selected columnns + A new parsons table containing the selected columnns. + """ from parsons.etl.table import Table @@ -808,9 +834,10 @@ def select_rows(self, *filters): >>> {'foo': 'a', 'bar': 2, 'baz': 88.1} `Args:` - *filters: function or str + *filters: function or str. `Returns:` - A new parsons table containing the selected rows + A new parsons table containing the selected rows. + """ from parsons.etl.table import Table @@ -825,11 +852,12 @@ def remove_null_rows(self, columns, null_value=None): `Args:` column: str or list - The column or columns to analyze + The column or columns to analyze. null_value: int or float or str - The null value + The null value. `Returns:` ``None`` + """ if isinstance(columns, str): columns = [columns] @@ -858,11 +886,12 @@ def stack(self, *tables, missing=None): `Args:` tables: Parsons Table or list - A single table, or a list of tables + A single table, or a list of tables. missing: bool - The value to use when padding missing values + The value to use when padding missing values. `Returns:` - ``None`` + ``None``. + """ if type(tables) not in [list, tuple]: @@ -881,11 +910,12 @@ def concat(self, *tables, missing=None): `Args:` tables: Parsons Table or list - A single table, or a list of tables + A single table, or a list of tables. missing: bool - The value to use when padding missing values + The value to use when padding missing values. `Returns:` - ``None`` + ``None``. + """ if type(tables) not in [list, tuple]: @@ -901,9 +931,10 @@ def chunk(self, rows: int): `Args:` rows: int - The number of rows of each new Parsons table + The number of rows of each new Parsons table. `Returns:` - List of Parsons tables + List of Parsons tables. + """ from parsons.etl import Table @@ -920,7 +951,8 @@ def get_normalized_column_name(column_name): `Returns:` str - Normalized column name + Normalized column name. + """ column_name = column_name.lower().strip() @@ -939,7 +971,7 @@ def match_columns( `Args:` desired_columns: list - Ordered list of desired column names + Ordered list of desired column names. fuzzy_match: bool Whether to normalize column names when matching against the desired column names, removing whitespace and non-alphanumeric characters, and lowercasing everything. @@ -954,7 +986,8 @@ def match_columns( value of None), 'ignore' them, or 'fail' (raising an error). `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ from parsons.etl import Table # Just trying to avoid recursive imports. @@ -1088,7 +1121,7 @@ def reduce_rows(self, columns, reduce_func, headers, presorted=False, **kwargs): reduce_func: fun The function by which to reduce the rows. Should take the 2 arguments, the columns list and the rows list and return a list. - `reducer(columns: list, rows: list) -> list;` + `reducer(columns: list, rows: list) -> list;`. headers: list The list of headers for modified table. The length of `headers` should match the length of the list returned by the reduce @@ -1096,7 +1129,7 @@ def reduce_rows(self, columns, reduce_func, headers, presorted=False, **kwargs): presorted: bool If false, the row will be sorted. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. """ @@ -1122,7 +1155,8 @@ def sort(self, columns=None, reverse=False): reverse: boolean Sort rows in reverse order. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ self.table = petl.sort(self.table, key=columns, reverse=reverse) @@ -1135,9 +1169,10 @@ def set_header(self, new_header): `Args:` new_header: list - List of new header column names + List of new header column names. `Returns:` - `Parsons Table` and also updates self + `Parsons Table` and also updates self. + """ self.table = petl.setheader(self.table, new_header) return self @@ -1177,7 +1212,7 @@ def use_petl(self, petl_method, *args, **kwargs): `Args:` petl_method: str - The ``petl`` function to call + The ``petl`` function to call. update_table: bool If ``True``, updates the ``parsons.Table``. Defaults to ``False``. @@ -1189,7 +1224,8 @@ def use_petl(self, petl_method, *args, **kwargs): **kwargs: Any The keyword arguements to pass to the petl function. `Returns:` - `parsons.Table` or `petl` table + `parsons.Table` or `petl` table. + """ update_table = kwargs.pop("update_table", False) to_petl = kwargs.pop("to_petl", False) @@ -1283,7 +1319,7 @@ def deduplicate(self, keys=None, presorted=False): presorted: bool If false, the row will be sorted. `Returns`: - `Parsons Table` and also updates self + `Parsons Table` and also updates self. """ From d386fb4c4f1b0da2ea74d756f79a7440f0134198 Mon Sep 17 00:00:00 2001 From: crusopaul Date: Sun, 22 Feb 2026 11:32:47 -0500 Subject: [PATCH 2/8] Update etl.py --- parsons/etl/etl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index 4ab58db1a7..71d4d87eb0 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -44,7 +44,7 @@ def add_column( self, column: str, value: Any = None, - index: int = None, + index: int | None = None, if_exists: Literal["fail", "replace"] = "fail", ) -> Self: """ From d4ff67046f3ffe7204bac72893e6bbe323ea8a7c Mon Sep 17 00:00:00 2001 From: crusopaul Date: Sun, 22 Feb 2026 13:27:45 -0500 Subject: [PATCH 3/8] Update etl.py --- parsons/etl/etl.py | 82 ++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index 71d4d87eb0..fbf80cb611 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -1,4 +1,5 @@ import logging +from collections.abc import Callable from typing import Any, Literal, Self import petl @@ -140,15 +141,16 @@ def rename_columns(self, column_map: dict) -> Self: self.table = petl.rename(self.table, column_map) return self - def fill_column(self, column_name: str, fill_value: Any = None) -> Self: + def fill_column(self, column_name: str, fill_value: Callable[Any, Any] | Any = None) -> Self: """ Fill a column in a table. `Args:` column_name: str The column to fill. - fill_value: Any - A fixed or calculated value. + fill_value: Callable[Any, Any] | Any + A conversion function taking a single argument and returning the converted + value. Alternatively, a fixed or calculated value (or None). `Returns:` ETL: The modified ETL (self). @@ -163,17 +165,18 @@ def fill_column(self, column_name: str, fill_value: Any = None) -> Self: return self - def fillna_column(self, column_name, fill_value): + def fillna_column(self, column_name: str, fill_value: Callable[Any, Any] | Any) -> Self: """ Fill None values in a column in a table. `Args:` column_name: str The column to fill. - fill_value: - A fixed or calculated value. + fill_value: Callable[Any, Any] | Any + A conversion function taking a single argument and returning the converted + value. Alternatively, a fixed or calculated value. `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). """ @@ -195,25 +198,24 @@ def fillna_column(self, column_name, fill_value): return self - def move_column(self, column, index): + def move_column(self, column: str, index: int) -> Self: """ Move a column. `Args:` column: str The column name to move. - index: + index: int The new index for the column. `Returns:` - `Parsons Table` and also updates existing object. + ETL: The modified ETL (self). """ self.table = petl.movefield(self.table, column, index) - return self - def convert_column(self, *column, **kwargs): + def convert_column(self, *column: str, **kwargs: Callable[Any, Any] | Any) -> Self: """ Transform values under one or more fields via arbitrary functions, method invocations or dictionary translations. This leverages the petl ``convert()`` @@ -222,18 +224,17 @@ def convert_column(self, *column, **kwargs): `Args:` *column: str A single column or multiple columns passed as a list. - **kwargs: str, method or variable + **kwargs: Callable[Any, Any] | Any The update function, method, or variable to process the update. `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). """ self.table = petl.convert(self.table, *column, **kwargs) - return self - def get_column_max_width(self, column): + def get_column_max_width(self, column: str) -> int: """ Return the maximum width of the column. @@ -241,7 +242,7 @@ def get_column_max_width(self, column): column: str The column name. `Returns:` - int. + int: The max width. """ @@ -253,13 +254,13 @@ def get_column_max_width(self, column): return max_width - def convert_columns_to_str(self): + def convert_columns_to_str(self) -> Self: """ Convenience function to convert all non-string or mixed columns in a Parsons table to string (e.g. for comparison.) `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). """ @@ -282,7 +283,9 @@ def str_or_empty(x): return self - def coalesce_columns(self, dest_column, source_columns, remove_source_columns=True): + def coalesce_columns( + self, dest_column: str, source_columns: list, remove_source_columns: bool = True + ) -> Self: """ Coalesces values from one or more source columns into a destination column, by selecting the first non-empty value. If the destination column doesn't exist, it will be added. @@ -293,10 +296,11 @@ def coalesce_columns(self, dest_column, source_columns, remove_source_columns=Tr source_columns: list List of source column names. remove_source_columns: bool - Whether to remove the source columns after the coalesce. If the destination - column is also one of the source columns, it will not be removed. + Optionally, whether to remove the source columns after the coalesce. If the + destination column is also one of the source columns, it will not be removed. + Defaults to True. `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). """ @@ -327,7 +331,7 @@ def add_fn(row): return self - def map_columns(self, column_map, exact_match=True): + def map_columns(self, column_map: dict, exact_match: bool = True) -> Self: """ Standardizes column names based on multiple possible values. This method is helpful when your input table might have multiple and unknown column @@ -336,11 +340,11 @@ def map_columns(self, column_map, exact_match=True): `Args:` column_map: dict A dictionary of columns and possible values that map to it. - exact_match: boolean - If ``True`` will only map if an exact match. If ``False`` will - ignore case, spaces and underscores. + exact_match: bool + Optionally, if ``True`` will only map if an exact match. If ``False`` will + ignore case, spaces and underscores. Defaults to True. `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). .. code-block:: python @@ -367,7 +371,7 @@ def map_columns(self, column_map, exact_match=True): return self - def map_and_coalesce_columns(self, column_map): + def map_and_coalesce_columns(self, column_map: dict) -> Self: """ Coalesces columns based on multiple possible values. The columns in the map do not need to be in your table, so you can create a map with all possibilities. @@ -381,7 +385,7 @@ def map_and_coalesce_columns(self, column_map): A dictionary of columns and possible values that map to it. `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). .. code-block:: python @@ -419,7 +423,7 @@ def map_and_coalesce_columns(self, column_map): return self - def get_column_types(self, column): + def get_column_types(self, column: str) -> list: """ Return all of the Python types for values in a given column. @@ -427,29 +431,24 @@ def get_column_types(self, column): column: str Name of the column to analyze. `Returns:` - list - A list of Python types. + list: A list of Python types. """ return list(petl.typeset(self.table, column)) - def get_columns_type_stats(self): + def get_columns_type_stats(self) -> list: """ Return descriptive stats for all columns. `Returns:` - list - A list of dicts. - `Returns:` - list - A list of dicts, each containing a column 'name' and a 'type' list. + list: A list of dicts, each containing a column 'name' and a 'type' list. """ return [{"name": col, "type": self.get_column_types(col)} for col in self.table.columns()] - def convert_table(self, *args): + def convert_table(self, *args: Callable[Any, Any] | Any) -> Self: r""" Transform all cells in a table via arbitrary functions, method invocations or dictionary translations. This method is useful for cleaning fields and data hygiene functions such @@ -457,7 +456,7 @@ def convert_table(self, *args): found `here` `_. `Args:` - *args: str, method or variable + *args: Callable[Any, Any] The update function, method, or variable to process the update. Can also... `Returns:` `Parsons Table` and also updates self. @@ -465,7 +464,6 @@ def convert_table(self, *args): """ self.convert_column(self.columns, *args) - return self def unpack_dict( From d325f102cf1bcabfa187e0e6f06a3e73c540e7e6 Mon Sep 17 00:00:00 2001 From: crusopaul Date: Mon, 23 Feb 2026 05:38:44 -0500 Subject: [PATCH 4/8] Update etl.py --- parsons/etl/etl.py | 161 ++++++++++++++++++++++----------------------- 1 file changed, 80 insertions(+), 81 deletions(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index fbf80cb611..8437f6e818 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -1,9 +1,15 @@ +from __future__ import annotations + import logging -from collections.abc import Callable -from typing import Any, Literal, Self +from typing import TYPE_CHECKING, Any, Literal + +if TYPE_CHECKING: + from collections.abc import Callable import petl +import parsons + logger = logging.getLogger(__name__) @@ -11,7 +17,7 @@ class ETL: def __init__(self): self.table = petl.fromdicts([]) - def head(self, n: int = 5) -> Self: + def head(self, n: int = 5) -> ETL: """ Select the first n rows of the table, dropping other records. @@ -26,7 +32,7 @@ def head(self, n: int = 5) -> Self: self.table = petl.head(self.table, n) return self - def tail(self, n: int = 5): + def tail(self, n: int = 5) -> ETL: """ Select the last n rows of the table, dropping other records. @@ -47,7 +53,7 @@ def add_column( value: Any = None, index: int | None = None, if_exists: Literal["fail", "replace"] = "fail", - ) -> Self: + ) -> ETL: """ Add a column to your table. @@ -77,7 +83,7 @@ def add_column( self.table = self.table.addfield(column, value, index) return self - def remove_column(self, *columns: str) -> Self: + def remove_column(self, *columns: str) -> ETL: r""" Remove a column(s) from your table. @@ -92,7 +98,7 @@ def remove_column(self, *columns: str) -> Self: self.table = petl.cutout(self.table, *columns) return self - def rename_column(self, column_name: str, new_column_name: str) -> Self: + def rename_column(self, column_name: str, new_column_name: str) -> ETL: """ Rename a column. @@ -112,7 +118,7 @@ def rename_column(self, column_name: str, new_column_name: str) -> Self: self.table = petl.rename(self.table, column_name, new_column_name) return self - def rename_columns(self, column_map: dict) -> Self: + def rename_columns(self, column_map: dict) -> ETL: """ Rename multiple columns. @@ -141,16 +147,16 @@ def rename_columns(self, column_map: dict) -> Self: self.table = petl.rename(self.table, column_map) return self - def fill_column(self, column_name: str, fill_value: Callable[Any, Any] | Any = None) -> Self: + def fill_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) -> ETL: """ Fill a column in a table. `Args:` column_name: str The column to fill. - fill_value: Callable[Any, Any] | Any + fill_value: Callable[[Any], Any] | Any A conversion function taking a single argument and returning the converted - value. Alternatively, a fixed or calculated value (or None). + value. Alternatively, a fixed or calculated value. `Returns:` ETL: The modified ETL (self). @@ -165,14 +171,14 @@ def fill_column(self, column_name: str, fill_value: Callable[Any, Any] | Any = N return self - def fillna_column(self, column_name: str, fill_value: Callable[Any, Any] | Any) -> Self: + def fillna_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) -> ETL: """ Fill None values in a column in a table. `Args:` column_name: str The column to fill. - fill_value: Callable[Any, Any] | Any + fill_value: Callable[[Any], Any] | Any A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. `Returns:` @@ -198,7 +204,7 @@ def fillna_column(self, column_name: str, fill_value: Callable[Any, Any] | Any) return self - def move_column(self, column: str, index: int) -> Self: + def move_column(self, column: str, index: int) -> ETL: """ Move a column. @@ -215,7 +221,7 @@ def move_column(self, column: str, index: int) -> Self: self.table = petl.movefield(self.table, column, index) return self - def convert_column(self, *column: str, **kwargs: Callable[Any, Any] | Any) -> Self: + def convert_column(self, *column: str, **kwargs: Callable[[Any], Any] | Any) -> ETL: """ Transform values under one or more fields via arbitrary functions, method invocations or dictionary translations. This leverages the petl ``convert()`` @@ -224,7 +230,7 @@ def convert_column(self, *column: str, **kwargs: Callable[Any, Any] | Any) -> Se `Args:` *column: str A single column or multiple columns passed as a list. - **kwargs: Callable[Any, Any] | Any + **kwargs: Callable[[Any], Any] | Any The update function, method, or variable to process the update. `Returns:` ETL: The modified ETL (self). @@ -254,7 +260,7 @@ def get_column_max_width(self, column: str) -> int: return max_width - def convert_columns_to_str(self) -> Self: + def convert_columns_to_str(self) -> ETL: """ Convenience function to convert all non-string or mixed columns in a Parsons table to string (e.g. for comparison.) @@ -285,7 +291,7 @@ def str_or_empty(x): def coalesce_columns( self, dest_column: str, source_columns: list, remove_source_columns: bool = True - ) -> Self: + ) -> ETL: """ Coalesces values from one or more source columns into a destination column, by selecting the first non-empty value. If the destination column doesn't exist, it will be added. @@ -331,7 +337,7 @@ def add_fn(row): return self - def map_columns(self, column_map: dict, exact_match: bool = True) -> Self: + def map_columns(self, column_map: dict, exact_match: bool = True) -> ETL: """ Standardizes column names based on multiple possible values. This method is helpful when your input table might have multiple and unknown column @@ -371,7 +377,7 @@ def map_columns(self, column_map: dict, exact_match: bool = True) -> Self: return self - def map_and_coalesce_columns(self, column_map: dict) -> Self: + def map_and_coalesce_columns(self, column_map: dict) -> ETL: """ Coalesces columns based on multiple possible values. The columns in the map do not need to be in your table, so you can create a map with all possibilities. @@ -448,7 +454,7 @@ def get_columns_type_stats(self) -> list: return [{"name": col, "type": self.get_column_types(col)} for col in self.table.columns()] - def convert_table(self, *args: Callable[Any, Any] | Any) -> Self: + def convert_table(self, *args: Callable[[Any], Any] | Any) -> ETL: r""" Transform all cells in a table via arbitrary functions, method invocations or dictionary translations. This method is useful for cleaning fields and data hygiene functions such @@ -456,10 +462,10 @@ def convert_table(self, *args: Callable[Any, Any] | Any) -> Self: found `here` `_. `Args:` - *args: Callable[Any, Any] + *args: Callable[[Any], Any] The update function, method, or variable to process the update. Can also... `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). """ @@ -468,14 +474,14 @@ def convert_table(self, *args: Callable[Any, Any] | Any) -> Self: def unpack_dict( self, - column, - keys=None, - include_original=False, - sample_size=5000, + column: str, + keys: list | None = None, + include_original: bool = False, + sample_size: int = 5000, missing=None, - prepend=True, - prepend_value=None, - ): + prepend: bool = True, + prepend_value: str | None = None, + ) -> ETL: """ Unpack dictionary values from one column into separate columns. @@ -485,20 +491,20 @@ def unpack_dict( keys: list The dict keys in the column to unpack. If ``None`` will unpack all. - include_original: boolean + include_original: bool Retain original column after unpacking. sample_size: int Number of rows to sample before determining columns. missing: str If a value is missing, the value to fill it with. - prepend: + prepend: bool Prepend the column name of the unpacked values. Useful for avoiding duplicate column names. - prepend_value: + prepend_value: str Value to prepend new columns if ``prepend=True``. If None, will set to column name. `Returns:` - `Parsons Table` and also updates self. + ETL: The modified ETL (self). """ @@ -523,12 +529,12 @@ def unpack_dict( def unpack_list( self, - column, - include_original=False, - missing=None, - replace=False, - max_columns=None, - ): + column: str, + include_original: bool = False, + missing: str | None = None, + replace: bool = False, + max_columns: int | None = None, + ) -> petl.util.base.Table: """ Unpack list values from one column into separate columns. Numbers the columns. @@ -553,18 +559,18 @@ def unpack_list( `Args:` column: str The column name to unpack. - include_original: boolean - Retain original column after unpacking. + include_original: bool + Retain original column after unpacking. Defaults to False. sample_size: int Number of rows to sample before determining columns. missing: str - If a value is missing, the value to fill it with. - replace: boolean - Return new table or replace existing. + Optionally, a default value to use when values are missing. + replace: bool + Return new table or replace existing. Defaults to False. max_columns: int - The maximum number of columns to unpack. + Optionally, the maximum number of columns to unpack. `Returns:` - `Parsons Table` and also updates self. + petl.util.base.Table: The modified table. """ @@ -602,7 +608,9 @@ def unpack_list( else: return tbl - def unpack_nested_columns_as_rows(self, column, key="id", expand_original: bool | int = False): + def unpack_nested_columns_as_rows( + self, column: str, key: str = "id", expand_original: bool | int = False + ) -> petl.util.base.Table: """ Unpack list or dict values from one column into separate rows. Not recommended for JSON columns (i.e. lists of dicts), but can handle columns @@ -613,14 +621,14 @@ def unpack_nested_columns_as_rows(self, column, key="id", expand_original: bool The column name to unpack. key: str The column to use as a key when unpacking. Defaults to `id`. - expand_original: boolean or int + expand_original: bool | int If `True`: Add resulting unpacked rows (with all other columns) to original. If `int`: Add to original unless the max added per key is above the given number. If `False` (default): Return unpacked rows (with `key` column only) as standalone. Removes packed list and dict rows from original either way. `Returns:` - If `expand_original`, original table with packed rows replaced by unpacked rows. - Otherwise, standalone table with key column and unpacked values only. + petl.util.base.Table:: If `expand_original`, original table with packed rows replaced + by unpacked rows. Otherwise, standalone table with key column and unpacked values only. """ @@ -655,11 +663,9 @@ def unpack_nested_columns_as_rows(self, column, key="id", expand_original: bool table_dict = table.select_rows(lambda row: isinstance(row[column], dict)) table_dict.unpack_dict(column, prepend=False) - from parsons.etl.table import Table - # Use melt to pivot both sets of columns into their own Tables and clean out None values - melted_list = Table(petl.melt(table_list.table, ignore_cols)) - melted_dict = Table(petl.melt(table_dict.table, ignore_cols)) + melted_list = parsons.etl.table.Table(petl.melt(table_list.table, ignore_cols)) + melted_dict = parsons.etl.table.Table(petl.melt(table_dict.table, ignore_cols)) melted_list.remove_null_rows("value") melted_dict.remove_null_rows("value") @@ -709,13 +715,13 @@ def unpack_nested_columns_as_rows(self, column, key="id", expand_original: bool def long_table( self, - key, - column, - key_rename=None, - retain_original=False, - prepend=True, - prepend_value=None, - ): + key: list, + column: str, + key_rename: dict | None = None, + retain_original: bool = False, + prepend: bool = True, + prepend_value: str = None, + ) -> parsons.etl.table.Table: """ Create a new long parsons table from a column, including the foreign key. @@ -743,7 +749,7 @@ def long_table( >>> {'id': '5421', 'emails_home': None, 'emails_work': 'jane@mywork.com'} `Args:` - key: lst + key: list The columns to retain in the long table (e.g. foreign keys). column: str The column name to make long. @@ -751,17 +757,16 @@ def long_table( The new name for the foreign key to better identify it. For example, you might want to rename ``id`` to ``person_id``. Ex. {'KEY_NAME': 'NEW_KEY_NAME'}. - retain_original: boolean - Retain the original column from the source table. - prepend: + retain_original: bool + Retain the original column from the source table. Defaults to False. + prepend: bool Prepend the column name of the unpacked values. Useful for - avoiding duplicate column names. - prepend_value: + avoiding duplicate column names. Defaults to True. + prepend_value: str Value to prepend new columns if ``prepend=True``. If None, will set to column name. `Returns:` - Parsons Table - The new long table. + parsons.etl.table.Table: The modified Parsons Table. """ @@ -789,7 +794,7 @@ def long_table( return lt - def cut(self, *columns): + def cut(self, *columns: str) -> parsons.etl.table.Table: r""" Return a table of selection of columns. @@ -797,13 +802,11 @@ def cut(self, *columns): *columns: str Columns in the parsons table. `Returns:` - A new parsons table containing the selected columnns. + parsons.etl.table.Table: The modified Parsons Table. """ - from parsons.etl.table import Table - - return Table(petl.cut(self.table, *columns)) + return parsons.etl.table.Table(petl.cut(self.table, *columns)) def select_rows(self, *filters): r""" @@ -838,9 +841,7 @@ def select_rows(self, *filters): """ - from parsons.etl.table import Table - - return Table(petl.select(self.table, *filters)) + return parsons.etl.table.Table(petl.select(self.table, *filters)) def remove_null_rows(self, columns, null_value=None): """ @@ -1234,9 +1235,7 @@ def use_petl(self, petl_method, *args, **kwargs): if to_petl: return getattr(petl, petl_method)(self.table, *args, **kwargs) - from parsons.etl.table import Table - - return Table(getattr(petl, petl_method)(self.table, *args, **kwargs)) + return parsons.etl.table.Table(getattr(petl, petl_method)(self.table, *args, **kwargs)) def deduplicate(self, keys=None, presorted=False): """ From 2a29cd80cbbbe9028c66872939a16762ee485fb0 Mon Sep 17 00:00:00 2001 From: crusopaul Date: Tue, 3 Mar 2026 15:38:45 -0500 Subject: [PATCH 5/8] Update etl.py --- parsons/etl/etl.py | 94 +++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 51 deletions(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index 33834122ca..ef8418ca0b 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: from collections.abc import Callable @@ -52,7 +52,7 @@ def tail(self, n: int = 5) -> ETL: def add_column( self, column: str, - value: Any = None, + value: object = None, index: int | None = None, if_exists: Literal["fail", "replace"] = "fail", ) -> ETL: @@ -62,7 +62,7 @@ def add_column( Args: column: str Name of column to add. - value: Any + value: object A fixed or calculated value. index: int Optionally, the position of the new column in the table. Default behavior @@ -152,14 +152,14 @@ def rename_columns(self, column_map: dict) -> ETL: self.table = petl.rename(self.table, column_map) return self - def fill_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) -> ETL: + def fill_column(self, column_name: str, fill_value: Callable[[object], object] | object) -> ETL: """ Fill a column in a table. Args: column_name: str The column to fill. - fill_value: Callable[[Any], Any] | Any + fill_value: Callable[[object], object] | object A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. @@ -177,14 +177,16 @@ def fill_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) return self - def fillna_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) -> ETL: + def fillna_column( + self, column_name: str, fill_value: Callable[[object], object] | object + ) -> ETL: """ Fill None values in a column in a table. Args: column_name: str The column to fill. - fill_value: Callable[[Any], Any] | Any + fill_value: Callable[[object], object] | object A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. @@ -229,7 +231,7 @@ def move_column(self, column: str, index: int) -> ETL: self.table = petl.movefield(self.table, column, index) return self - def convert_column(self, *column: str, **kwargs: Callable[[Any], Any] | Any) -> ETL: + def convert_column(self, *column: str, **kwargs: Callable[[object], object] | object) -> ETL: """ Transform values under one or more fields via arbitrary functions, method invocations or dictionary translations. This leverages the petl convert() @@ -238,7 +240,7 @@ def convert_column(self, *column: str, **kwargs: Callable[[Any], Any] | Any) -> Args: *column: str A single column or multiple columns passed as a list. - **kwargs: Callable[[Any], Any] | Any + **kwargs: Callable[[object], object] | object The update function, method, or variable to process the update. Returns: @@ -286,21 +288,16 @@ def convert_columns_to_str(self) -> ETL: cols = self.get_columns_type_stats() - def str_or_empty(x): - if x is None: - return "" - return str(x) - for col in cols: # If there's more than one type (or no types), convert to str # Also if there is one type and it's not str, convert to str if len(col["type"]) != 1 or col["type"][0] != "str": - self.convert_column(col["name"], str_or_empty) + self.convert_column(col["name"], lambda x: "" if x is None else str(x)) return self def coalesce_columns( - self, dest_column: str, source_columns: list, remove_source_columns: bool = True + self, dest_column: str, source_columns: list[str], remove_source_columns: bool = True ) -> ETL: """ Coalesces values from one or more source columns into a destination column, by selecting @@ -309,7 +306,7 @@ def coalesce_columns( Args: dest_column: str Name of destination column. - source_columns: list + source_columns: list[str] List of source column names. remove_source_columns: bool Optionally, whether to remove the source columns after the coalesce. If the @@ -323,7 +320,7 @@ def coalesce_columns( if dest_column in self.columns: - def convert_fn(value, row): + def convert_fn(value: object, row: object) -> object: for source_col in source_columns: if row.get(source_col): return row[source_col] @@ -333,7 +330,7 @@ def convert_fn(value, row): else: - def add_fn(row): + def add_fn(row: object) -> object: for source_col in source_columns: if row.get(source_col): return row[source_col] @@ -473,7 +470,7 @@ def get_columns_type_stats(self) -> list: return [{"name": col, "type": self.get_column_types(col)} for col in self.table.columns()] - def convert_table(self, *args: Callable[[Any], Any] | Any) -> ETL: + def convert_table(self, *args: Callable[[object], object] | object) -> ETL: r""" Transform all cells in a table via arbitrary functions, method invocations or dictionary translations. This method is useful for cleaning fields and data hygiene functions such @@ -495,10 +492,10 @@ def convert_table(self, *args: Callable[[Any], Any] | Any) -> ETL: def unpack_dict( self, column: str, - keys: list | None = None, + keys: list[str] | None = None, include_original: bool = False, sample_size: int = 5000, - missing=None, + missing: object = None, prepend: bool = True, prepend_value: str | None = None, ) -> ETL: @@ -508,14 +505,14 @@ def unpack_dict( Args: column: str The column name to unpack. - keys: list + keys: list[str] The dict keys in the column to unpack. If None will unpack all. include_original: bool Retain original column after unpacking. sample_size: int Number of rows to sample before determining columns. - missing: str + missing: object If a value is missing, the value to fill it with. prepend: bool Prepend the column name of the unpacked values. Useful for @@ -628,7 +625,6 @@ def unpack_list( if replace: self.table = tbl - else: return tbl @@ -740,12 +736,12 @@ def unpack_nested_columns_as_rows( def long_table( self, - key: list, + key: list[str], column: str, key_rename: dict | None = None, retain_original: bool = False, prepend: bool = True, - prepend_value: str = None, + prepend_value: str | None = None, ) -> parsons.etl.table.Table: """ Create a new long parsons table from a column, including the foreign @@ -777,7 +773,7 @@ def long_table( >>> {'id': '5421', 'emails_home': None, 'emails_work': 'jane@mywork.com'} Args: - key: list + key: list[str] The columns to retain in the long table (e.g. foreign keys). column: str The column name to make long. @@ -838,7 +834,7 @@ def cut(self, *columns: str) -> parsons.etl.table.Table: return parsons.etl.table.Table(petl.cut(self.table, *columns)) - def select_rows(self, *filters): + def select_rows(self, *filters: Callable[[object], bool]) -> parsons.etl.table.Table: r""" Select specific rows from a Parsons table based on the passed filters. @@ -872,27 +868,24 @@ def select_rows(self, *filters): *filters: function or str. Returns: - A new parsons table containing the selected rows. + parsons.etl.table.Table: A new parsons table containing the selected rows. """ return parsons.etl.table.Table(petl.select(self.table, *filters)) - def remove_null_rows(self, columns, null_value=None): + def remove_null_rows(self, columns: list[str] | str, null_value: object = None) -> None: """ Remove rows if the values in a column are None. If multiple columns are passed as list, it will remove all rows with null values in any of the passed columns. Args: - column: str or list + column: list[str] | str The column or columns to analyze. - null_value: int or float or str + null_value: object The null value. - Returns: - None - """ if isinstance(columns, str): columns = [columns] @@ -902,7 +895,7 @@ def remove_null_rows(self, columns, null_value=None): return self - def _prepend_dict(self, dict_obj, prepend): + def _prepend_dict(self, dict_obj: dict, prepend: object) -> dict: # Internal method to rename dict keys new_dict = {} @@ -912,7 +905,7 @@ def _prepend_dict(self, dict_obj, prepend): return new_dict - def stack(self, *tables, missing=None): + def stack(self, *tables, missing=None) -> None: """ Stack Parsons tables on top of one another. @@ -925,9 +918,6 @@ def stack(self, *tables, missing=None): missing: bool The value to use when padding missing values. - Returns: - None. - """ if type(tables) not in [list, tuple]: @@ -1001,7 +991,7 @@ def get_normalized_column_name(column_name: str) -> str: def match_columns( self, - desired_columns, + desired_columns: list[str], fuzzy_match=True, if_extra_columns: Literal["remove", "ignore", "fail"] = "remove", if_missing_columns: Literal["add", "ignore", "fail"] = "add", @@ -1011,7 +1001,7 @@ def match_columns( names. Args: - desired_columns: list + desired_columns: list[str] Ordered list of desired column names. fuzzy_match: bool Whether to normalize column names when matching against the desired column names, @@ -1106,7 +1096,9 @@ def match_columns( return self - def reduce_rows(self, columns, reduce_func, headers, presorted=False, **kwargs): + def reduce_rows( + self, columns: list[str], reduce_func, headers: list[str], presorted=False, **kwargs + ): """ Group rows by a column or columns, then reduce the groups to a single row. @@ -1163,13 +1155,13 @@ def reduce_rows(self, columns, reduce_func, headers, presorted=False, **kwargs): +-------------------------+-----------------------------------------------------------------------+ Args: - columns (list): + columns: list[str] The column(s) by which to group the rows. reduce_func: fun The function by which to reduce the rows. Should take the 2 arguments, the columns list and the rows list and return a list. - `reducer(columns: list, rows: list) -> list;`. - headers: list + `reducer(columns: list[str], rows: list[object]) -> list[object];`. + headers: list[str] The list of headers for modified table. The length should match the length of the list returned by the reduce function. presorted: bool @@ -1191,12 +1183,12 @@ def reduce_rows(self, columns, reduce_func, headers, presorted=False, **kwargs): return self - def sort(self, columns=None, reverse=False): + def sort(self, columns: list[str] | str | None = None, reverse=False): """ Sort the rows a table. Args: - sort_columns: list or str + columns: list[str] | str Sort by a single column or a list of column. If None then will sort columns from left to right. reverse: boolean @@ -1211,12 +1203,12 @@ def sort(self, columns=None, reverse=False): return self - def set_header(self, new_header): + def set_header(self, new_header: list[str]): """ Replace the header row of the table. Args: - new_header: list + new_header: list[str] List of new header column names. Returns: From 6c8d93ad22f0781295f13f71510982dd9c79755b Mon Sep 17 00:00:00 2001 From: crusopaul Date: Tue, 3 Mar 2026 18:05:40 -0500 Subject: [PATCH 6/8] no more *args & **kwargs types --- parsons/etl/etl.py | 282 ++++++++++++++++++++++++--------------------- 1 file changed, 153 insertions(+), 129 deletions(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index ef8418ca0b..aa3b31ab17 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -1,15 +1,13 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Literal, TypeVar if TYPE_CHECKING: from collections.abc import Callable import petl -import parsons - logger = logging.getLogger(__name__) @@ -25,8 +23,8 @@ def head(self, n: int = 5) -> ETL: n: int The number of rows to return. Defaults to 5. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -41,8 +39,8 @@ def tail(self, n: int = 5) -> ETL: n: int The number of rows to select. Defaults to 5. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -71,8 +69,8 @@ def add_column( If set "replace", this function will call fill_column. If the column already exists, rather than raising a ValueError. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -86,16 +84,16 @@ def add_column( self.table = self.table.addfield(column, value, index) return self - def remove_column(self, *columns: str) -> ETL: - r""" + def remove_column(self, *columns) -> ETL: + """ Remove a column(s) from your table. Args: - *columns: str + columns: Column name(s). - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -112,8 +110,8 @@ def rename_column(self, column_name: str, new_column_name: str) -> ETL: new_column_name: str The new column name. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -136,8 +134,8 @@ def rename_columns(self, column_map: dict) -> ETL: {'old_name': 'new_name', 'old_name2': 'new_name2'} - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -163,8 +161,8 @@ def fill_column(self, column_name: str, fill_value: Callable[[object], object] | A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -190,8 +188,8 @@ def fillna_column( A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -223,28 +221,28 @@ def move_column(self, column: str, index: int) -> ETL: index: int The new index for the column. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ self.table = petl.movefield(self.table, column, index) return self - def convert_column(self, *column: str, **kwargs: Callable[[object], object] | object) -> ETL: + def convert_column(self, *column, **kwargs) -> ETL: """ Transform values under one or more fields via arbitrary functions, method invocations or dictionary translations. This leverages the petl convert() method. Example usage can be found `here `_. Args: - *column: str + column: A single column or multiple columns passed as a list. - **kwargs: Callable[[object], object] | object + kwargs: The update function, method, or variable to process the update. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -259,8 +257,8 @@ def get_column_max_width(self, column: str) -> int: column: str The column name. - Returns: - int: The max width. + Returns: int + The max width. """ @@ -277,8 +275,8 @@ def convert_columns_to_str(self) -> ETL: Convenience function to convert all non-string or mixed columns in a Parsons table to string (e.g. for comparison.) - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -288,11 +286,14 @@ def convert_columns_to_str(self) -> ETL: cols = self.get_columns_type_stats() + def str_or_empty(x: str | None) -> str: + return "" if x is None else str(x) + for col in cols: # If there's more than one type (or no types), convert to str # Also if there is one type and it's not str, convert to str if len(col["type"]) != 1 or col["type"][0] != "str": - self.convert_column(col["name"], lambda x: "" if x is None else str(x)) + self.convert_column(col["name"], str_or_empty) return self @@ -313,14 +314,14 @@ def coalesce_columns( destination column is also one of the source columns, it will not be removed. Defaults to True. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ if dest_column in self.columns: - def convert_fn(value: object, row: object) -> object: + def convert_fn(value: object, row: dict) -> object: for source_col in source_columns: if row.get(source_col): return row[source_col] @@ -330,7 +331,7 @@ def convert_fn(value: object, row: object) -> object: else: - def add_fn(row: object) -> object: + def add_fn(row: dict) -> object: for source_col in source_columns: if row.get(source_col): return row[source_col] @@ -376,8 +377,8 @@ def map_columns(self, column_map: dict, exact_match: bool = True) -> ETL: Optionally, if True will only map if an exact match. If False will ignore case, spaces and underscores. Defaults to True. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -424,8 +425,8 @@ def map_and_coalesce_columns(self, column_map: dict) -> ETL: column_map: dict A dictionary of columns and possible values that map to it. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -444,7 +445,7 @@ def map_and_coalesce_columns(self, column_map: dict) -> ETL: return self - def get_column_types(self, column: str) -> list: + def get_column_types(self, column: str) -> list[TypeVar]: """ Return all of the Python types for values in a given column. @@ -452,37 +453,37 @@ def get_column_types(self, column: str) -> list: column: str Name of the column to analyze. - Returns: - list: A list of Python types. + Returns: list[TypeVar] + A list of Python types. """ return list(petl.typeset(self.table, column)) - def get_columns_type_stats(self) -> list: + def get_columns_type_stats(self) -> list[dict]: """ Return descriptive stats for all columns. - Returns: - list: A list of dicts, each containing a column 'name' and a 'type' list. + Returns: list[dict] + A list of dicts, each containing a column 'name' and a 'type' list. """ return [{"name": col, "type": self.get_column_types(col)} for col in self.table.columns()] - def convert_table(self, *args: Callable[[object], object] | object) -> ETL: - r""" + def convert_table(self, *args) -> ETL: + """ Transform all cells in a table via arbitrary functions, method invocations or dictionary translations. This method is useful for cleaning fields and data hygiene functions such as regex. This method leverages the petl convert() method. Example usage can be found `here` `_. Args: - *args: Callable[[Any], Any] + args: The update function, method, or variable to process the update. Can also... - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -521,8 +522,8 @@ def unpack_dict( Value to prepend new columns if prepend=True. If None, will set to column name. - Returns: - ETL: The modified ETL (self). + Returns: ETL + The modified ETL (self). """ @@ -590,8 +591,8 @@ def unpack_list( max_columns: int Optionally, the maximum number of columns to unpack. - Returns: - petl.util.base.Table: The modified table. + Returns: petl.util.base.Table + The new table or returns None if table is being replaced. """ @@ -630,7 +631,7 @@ def unpack_list( def unpack_nested_columns_as_rows( self, column: str, key: str = "id", expand_original: bool | int = False - ) -> petl.util.base.Table: + ) -> ETL: """ Unpack list or dict values from one column into separate rows. Not recommended for JSON columns (i.e. lists of dicts), but can handle columns @@ -647,12 +648,14 @@ def unpack_nested_columns_as_rows( If False (default): Return unpacked rows (with key column only) as standalone. Removes packed list and dict rows from original either way. - Returns: - petl.util.base.Table: If expand_original, original table with packed rows replaced + Returns: parsons.etl.table.Table + If expand_original, original table with packed rows replaced by unpacked rows. Otherwise, standalone table with key column and unpacked values only. """ + from parsons.etl import Table + if isinstance(expand_original, int) and expand_original is not True: lengths = {len(row[column]) for row in self if isinstance(row[column], (dict, list))} max_len = sorted(lengths, reverse=True)[0] @@ -661,12 +664,18 @@ def unpack_nested_columns_as_rows( if expand_original: # Include all columns and filter out other non-dict types in table_list + def select_lists(row: dict) -> bool: + return isinstance(row[column], list) + table = self - table_list = table.select_rows(lambda row: isinstance(row[column], list)) + table_list = table.select_rows(select_lists) else: # Otherwise, include only key and column, but keep all non-dict types in table_list + def select_non_dicts(row: dict) -> bool: + return not isinstance(row[column], dict) + table = self.cut(key, column) - table_list = table.select_rows(lambda row: not isinstance(row[column], dict)) + table_list = table.select_rows(select_non_dicts) # All the columns other than column to ignore while melting ignore_cols = table.columns @@ -685,8 +694,8 @@ def unpack_nested_columns_as_rows( table_dict.unpack_dict(column, prepend=False) # Use melt to pivot both sets of columns into their own Tables and clean out None values - melted_list = parsons.etl.table.Table(petl.melt(table_list.table, ignore_cols)) - melted_dict = parsons.etl.table.Table(petl.melt(table_dict.table, ignore_cols)) + melted_list = Table(petl.melt(table_list.table, ignore_cols)) + melted_dict = Table(petl.melt(table_dict.table, ignore_cols)) melted_list.remove_null_rows("value") melted_dict.remove_null_rows("value") @@ -720,7 +729,9 @@ def unpack_nested_columns_as_rows( orig.move_column(column, -1) output = orig else: - orig = self.remove_column(column) + from parsons.etl.table import Table + + orig = Table(self.remove_column(column).table) # Add unique id column by hashing all the other fields melted_list.add_column( "uid", @@ -742,7 +753,7 @@ def long_table( retain_original: bool = False, prepend: bool = True, prepend_value: str | None = None, - ) -> parsons.etl.table.Table: + ) -> ETL: """ Create a new long parsons table from a column, including the foreign key. @@ -790,8 +801,8 @@ def long_table( Value to prepend new columns if prepend=True. If None, will set to column name. - Returns: - parsons.etl.table.Table: The modified Parsons Table. + Returns: parsons.etl.table.Table + The Parsons Table. """ @@ -819,23 +830,25 @@ def long_table( return lt - def cut(self, *columns: str) -> parsons.etl.table.Table: - r""" + def cut(self, *columns) -> ETL: + """ Return a table of selection of columns. Args: - *columns: str + columns Columns in the parsons table. - Returns: - parsons.etl.table.Table: The modified Parsons Table. + Returns: parsons.etl.table.Table + The Parsons Table. """ - return parsons.etl.table.Table(petl.cut(self.table, *columns)) + from parsons.etl import Table + + return Table(petl.cut(self.table, *columns)) - def select_rows(self, *filters: Callable[[object], bool]) -> parsons.etl.table.Table: - r""" + def select_rows(self, *filters) -> ETL: + """ Select specific rows from a Parsons table based on the passed filters. @@ -865,16 +878,19 @@ def select_rows(self, *filters: Callable[[object], bool]) -> parsons.etl.table.T >>> {'foo': 'a', 'bar': 2, 'baz': 88.1} Args: - *filters: function or str. + filters: + Function or str. - Returns: - parsons.etl.table.Table: A new parsons table containing the selected rows. + Returns: parsons.etl.table.Table + A new parsons table containing the selected rows. """ - return parsons.etl.table.Table(petl.select(self.table, *filters)) + from parsons.etl import Table - def remove_null_rows(self, columns: list[str] | str, null_value: object = None) -> None: + return Table(petl.select(self.table, *filters)) + + def remove_null_rows(self, columns: list[str] | str, null_value: object = None) -> ETL: """ Remove rows if the values in a column are None. If multiple columns are passed as list, it will remove all rows with null values in any @@ -886,6 +902,9 @@ def remove_null_rows(self, columns: list[str] | str, null_value: object = None) null_value: object The null value. + Returns: ETL + The modified ETL (self). + """ if isinstance(columns, str): columns = [columns] @@ -895,7 +914,7 @@ def remove_null_rows(self, columns: list[str] | str, null_value: object = None) return self - def _prepend_dict(self, dict_obj: dict, prepend: object) -> dict: + def _prepend_dict(self, dict_obj: dict, prepend: str) -> dict: # Internal method to rename dict keys new_dict = {} @@ -905,7 +924,7 @@ def _prepend_dict(self, dict_obj: dict, prepend: object) -> dict: return new_dict - def stack(self, *tables, missing=None) -> None: + def stack(self, *tables, missing: object = None) -> None: """ Stack Parsons tables on top of one another. @@ -913,20 +932,20 @@ def stack(self, *tables, missing=None) -> None: different tables. Args: - tables: Parsons Table or list + tables A single table, or a list of tables. - missing: bool + missing: object The value to use when padding missing values. """ if type(tables) not in [list, tuple]: tables = [tables] - petl_tables = [tbl.table for tbl in tables] + petl_tables = [tbl.table for tbl in tables] self.table = petl.stack(self.table, *petl_tables, missing=missing) - def concat(self, *tables, missing=None): + def concat(self, *tables, missing: object = None) -> None: """ Concatenates one or more tables onto this one. @@ -935,23 +954,20 @@ def concat(self, *tables, missing=None): missing keyword argument. Args: - tables: Parsons Table or list + tables A single table, or a list of tables. - missing: bool + missing: object The value to use when padding missing values. - Returns: - None. - """ if type(tables) not in [list, tuple]: tables = [tables] - petl_tables = [tbl.table for tbl in tables] + petl_tables = [tbl.table for tbl in tables] self.table = petl.cat(self.table, *petl_tables, missing=missing) - def chunk(self, rows: int): + def chunk(self, rows: int) -> list[ETL]: """ Divides a Parsons table into smaller tables of a specified row count. If the table cannot be divided evenly, then the final table will only include the remainder. @@ -960,12 +976,12 @@ def chunk(self, rows: int): rows: int The number of rows of each new Parsons table. - Returns: - List of Parsons tables. + Returns: list[parsons.etl.table.Table] + A list of Parsons tables. """ - from parsons.etl import Table + from parsons.etl.table import Table return [ Table(petl.rowslice(self.table, i, i + rows)) for i in range(0, self.num_rows, rows) @@ -980,9 +996,8 @@ def get_normalized_column_name(column_name: str) -> str: Args: column_name: str - Returns: - str - Normalized column name. + Returns: str + Normalized column name. """ @@ -992,10 +1007,10 @@ def get_normalized_column_name(column_name: str) -> str: def match_columns( self, desired_columns: list[str], - fuzzy_match=True, + fuzzy_match: bool = True, if_extra_columns: Literal["remove", "ignore", "fail"] = "remove", if_missing_columns: Literal["add", "ignore", "fail"] = "add", - ): + ) -> ETL: """ Changes the column names and ordering in this Table to match a list of desired column names. @@ -1016,12 +1031,12 @@ def match_columns( If the Table is missing some of the desired columns, either 'add' them (with a value of None), 'ignore' them, or 'fail' (raising an error). - Returns: - Parsons.Table and also updates self. + Returns: ETL + The modified ETL (self). """ - from parsons.etl import Table # Just trying to avoid recursive imports. + from parsons.etl.table import Table # Just trying to avoid recursive imports. normalize_fn = Table.get_normalized_column_name if fuzzy_match else (lambda s: s) @@ -1097,8 +1112,13 @@ def match_columns( return self def reduce_rows( - self, columns: list[str], reduce_func, headers: list[str], presorted=False, **kwargs - ): + self, + columns: list[str], + reduce_func: Callable[[list[str], list[object]], list[object]], + headers: list[str], + presorted: bool = False, + **kwargs, + ) -> ETL: """ Group rows by a column or columns, then reduce the groups to a single row. @@ -1157,18 +1177,19 @@ def reduce_rows( Args: columns: list[str] The column(s) by which to group the rows. - reduce_func: fun + reduce_func: Callable[[list[str], list[object]], list[object]] The function by which to reduce the rows. Should take the 2 arguments, the columns list and the rows list and return a list. - `reducer(columns: list[str], rows: list[object]) -> list[object];`. headers: list[str] The list of headers for modified table. The length should match the length of the list returned by the reduce function. presorted: bool If false, the row will be sorted. + kwargs: + Optionally, the buffersize, tempdir, or cache to use. - Returns: - Parsons.Table and also updates self. + Returns: ETL + The modified ETL (self). """ @@ -1183,7 +1204,7 @@ def reduce_rows( return self - def sort(self, columns: list[str] | str | None = None, reverse=False): + def sort(self, columns: list[str] | str | None = None, reverse: bool = False) -> ETL: """ Sort the rows a table. @@ -1194,8 +1215,8 @@ def sort(self, columns: list[str] | str | None = None, reverse=False): reverse: boolean Sort rows in reverse order. - Returns: - Parsons.Table and also updates self. + Returns: ETL + The modified ETL (self). """ @@ -1203,7 +1224,7 @@ def sort(self, columns: list[str] | str | None = None, reverse=False): return self - def set_header(self, new_header: list[str]): + def set_header(self, new_header: list[str]) -> ETL: """ Replace the header row of the table. @@ -1211,14 +1232,14 @@ def set_header(self, new_header: list[str]): new_header: list[str] List of new header column names. - Returns: - Parsons.Table and also updates self. + Returns: ETL + The modified ETL (self). """ self.table = petl.setheader(self.table, new_header) return self - def use_petl(self, petl_method, *args, **kwargs): + def use_petl(self, petl_method: str, *args, **kwargs) -> ETL: """ Call a petl function on the current table. @@ -1260,15 +1281,18 @@ def use_petl(self, petl_method, *args, **kwargs): to_petl: bool If True, returns a petl table, otherwise a Parsons.Table. Defaults to False. - `*args`: Any - The arguements to pass to the petl function. - `**kwargs`: Any - The keyword arguements to pass to the petl function. + args: + The arguments to pass to the petl function. + kwargs: + The keyword arguments to pass to the petl function. - Returns: - Parsons.Table or petl table. + Returns: parsons.etl.table.Table + A Parsons table. """ + + from parsons.etl import Table + update_table = kwargs.pop("update_table", False) to_petl = kwargs.pop("to_petl", False) @@ -1278,9 +1302,9 @@ def use_petl(self, petl_method, *args, **kwargs): if to_petl: return getattr(petl, petl_method)(self.table, *args, **kwargs) - return parsons.etl.table.Table(getattr(petl, petl_method)(self.table, *args, **kwargs)) + return Table(getattr(petl, petl_method)(self.table, *args, **kwargs)) - def deduplicate(self, keys=None, presorted=False): + def deduplicate(self, keys: list[str] | str | None = None, presorted: bool = False) -> ETL: """ Deduplicates table based on an optional keys argument, which can contain any number of keys or None. @@ -1353,13 +1377,13 @@ def deduplicate(self, keys=None, presorted=False): +---+---+ Args: - keys: str or list[str] or None + keys: list[str] | str | None keys to deduplicate (and optionally sort) on. presorted: bool If false, the row will be sorted. - Returns: - Parsons Table and also updates self. + Returns: ETL + The modified ETL (self). """ From 8404260d0aecb92a70c853079a88fed83255e055 Mon Sep 17 00:00:00 2001 From: crusopaul Date: Tue, 3 Mar 2026 19:07:45 -0500 Subject: [PATCH 7/8] Update etl.py --- parsons/etl/etl.py | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index aa3b31ab17..690f9afd51 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Literal, TypeVar +from typing import TYPE_CHECKING, Any, Literal, TypeVar if TYPE_CHECKING: from collections.abc import Callable @@ -50,7 +50,7 @@ def tail(self, n: int = 5) -> ETL: def add_column( self, column: str, - value: object = None, + value: Any = None, index: int | None = None, if_exists: Literal["fail", "replace"] = "fail", ) -> ETL: @@ -60,7 +60,7 @@ def add_column( Args: column: str Name of column to add. - value: object + value: Any A fixed or calculated value. index: int Optionally, the position of the new column in the table. Default behavior @@ -150,14 +150,14 @@ def rename_columns(self, column_map: dict) -> ETL: self.table = petl.rename(self.table, column_map) return self - def fill_column(self, column_name: str, fill_value: Callable[[object], object] | object) -> ETL: + def fill_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) -> ETL: """ Fill a column in a table. Args: column_name: str The column to fill. - fill_value: Callable[[object], object] | object + fill_value: Callable[[Any], Any] | Any A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. @@ -175,16 +175,14 @@ def fill_column(self, column_name: str, fill_value: Callable[[object], object] | return self - def fillna_column( - self, column_name: str, fill_value: Callable[[object], object] | object - ) -> ETL: + def fillna_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) -> ETL: """ Fill None values in a column in a table. Args: column_name: str The column to fill. - fill_value: Callable[[object], object] | object + fill_value: Callable[[Any], Any] | Any A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. @@ -321,7 +319,7 @@ def coalesce_columns( if dest_column in self.columns: - def convert_fn(value: object, row: dict) -> object: + def convert_fn(value: Any, row: dict) -> Any: for source_col in source_columns: if row.get(source_col): return row[source_col] @@ -331,7 +329,7 @@ def convert_fn(value: object, row: dict) -> object: else: - def add_fn(row: dict) -> object: + def add_fn(row: dict) -> Any: for source_col in source_columns: if row.get(source_col): return row[source_col] @@ -496,7 +494,7 @@ def unpack_dict( keys: list[str] | None = None, include_original: bool = False, sample_size: int = 5000, - missing: object = None, + missing: Any = None, prepend: bool = True, prepend_value: str | None = None, ) -> ETL: @@ -513,7 +511,7 @@ def unpack_dict( Retain original column after unpacking. sample_size: int Number of rows to sample before determining columns. - missing: object + missing: Any If a value is missing, the value to fill it with. prepend: bool Prepend the column name of the unpacked values. Useful for @@ -890,7 +888,7 @@ def select_rows(self, *filters) -> ETL: return Table(petl.select(self.table, *filters)) - def remove_null_rows(self, columns: list[str] | str, null_value: object = None) -> ETL: + def remove_null_rows(self, columns: list[str] | str, null_value: Any = None) -> ETL: """ Remove rows if the values in a column are None. If multiple columns are passed as list, it will remove all rows with null values in any @@ -899,7 +897,7 @@ def remove_null_rows(self, columns: list[str] | str, null_value: object = None) Args: column: list[str] | str The column or columns to analyze. - null_value: object + null_value: Any The null value. Returns: ETL @@ -924,7 +922,7 @@ def _prepend_dict(self, dict_obj: dict, prepend: str) -> dict: return new_dict - def stack(self, *tables, missing: object = None) -> None: + def stack(self, *tables, missing: Any = None) -> None: """ Stack Parsons tables on top of one another. @@ -934,7 +932,7 @@ def stack(self, *tables, missing: object = None) -> None: Args: tables A single table, or a list of tables. - missing: object + missing: Any The value to use when padding missing values. """ @@ -945,7 +943,7 @@ def stack(self, *tables, missing: object = None) -> None: petl_tables = [tbl.table for tbl in tables] self.table = petl.stack(self.table, *petl_tables, missing=missing) - def concat(self, *tables, missing: object = None) -> None: + def concat(self, *tables, missing: Any = None) -> None: """ Concatenates one or more tables onto this one. @@ -956,7 +954,7 @@ def concat(self, *tables, missing: object = None) -> None: Args: tables A single table, or a list of tables. - missing: object + missing: Any The value to use when padding missing values. """ @@ -1114,7 +1112,7 @@ def match_columns( def reduce_rows( self, columns: list[str], - reduce_func: Callable[[list[str], list[object]], list[object]], + reduce_func: Callable[[list[str], list], list], headers: list[str], presorted: bool = False, **kwargs, @@ -1177,7 +1175,7 @@ def reduce_rows( Args: columns: list[str] The column(s) by which to group the rows. - reduce_func: Callable[[list[str], list[object]], list[object]] + reduce_func: Callable[[list[str], list], list] The function by which to reduce the rows. Should take the 2 arguments, the columns list and the rows list and return a list. headers: list[str] From 4f8a458a384fcd56fab3df13092e2a6928f16213 Mon Sep 17 00:00:00 2001 From: crusopaul Date: Tue, 3 Mar 2026 19:48:00 -0500 Subject: [PATCH 8/8] fix doc returns & add raises --- parsons/etl/etl.py | 154 +++++++++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 68 deletions(-) diff --git a/parsons/etl/etl.py b/parsons/etl/etl.py index 690f9afd51..fbdb3d8c3c 100644 --- a/parsons/etl/etl.py +++ b/parsons/etl/etl.py @@ -23,8 +23,8 @@ def head(self, n: int = 5) -> ETL: n: int The number of rows to return. Defaults to 5. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -39,8 +39,8 @@ def tail(self, n: int = 5) -> ETL: n: int The number of rows to select. Defaults to 5. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -69,8 +69,11 @@ def add_column( If set "replace", this function will call fill_column. If the column already exists, rather than raising a ValueError. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). + + Raises: + ValueError: The column already exists. """ @@ -92,8 +95,8 @@ def remove_column(self, *columns) -> ETL: columns: Column name(s). - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -110,8 +113,12 @@ def rename_column(self, column_name: str, new_column_name: str) -> ETL: new_column_name: str The new column name. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). + + Raises: + ValueError: + Column already exists. """ @@ -134,8 +141,14 @@ def rename_columns(self, column_map: dict) -> ETL: {'old_name': 'new_name', 'old_name2': 'new_name2'} - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). + + Raises: + KeyError: + Old column does not exist. + ValueError: + New column already exists. """ @@ -161,8 +174,8 @@ def fill_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any) A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -186,8 +199,8 @@ def fillna_column(self, column_name: str, fill_value: Callable[[Any], Any] | Any A conversion function taking a single argument and returning the converted value. Alternatively, a fixed or calculated value. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -219,8 +232,8 @@ def move_column(self, column: str, index: int) -> ETL: index: int The new index for the column. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -239,8 +252,8 @@ def convert_column(self, *column, **kwargs) -> ETL: kwargs: The update function, method, or variable to process the update. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -255,8 +268,8 @@ def get_column_max_width(self, column: str) -> int: column: str The column name. - Returns: int - The max width. + Returns: + int: The max width. """ @@ -273,8 +286,8 @@ def convert_columns_to_str(self) -> ETL: Convenience function to convert all non-string or mixed columns in a Parsons table to string (e.g. for comparison.) - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -312,8 +325,8 @@ def coalesce_columns( destination column is also one of the source columns, it will not be removed. Defaults to True. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -375,8 +388,8 @@ def map_columns(self, column_map: dict, exact_match: bool = True) -> ETL: Optionally, if True will only map if an exact match. If False will ignore case, spaces and underscores. Defaults to True. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -423,8 +436,8 @@ def map_and_coalesce_columns(self, column_map: dict) -> ETL: column_map: dict A dictionary of columns and possible values that map to it. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -451,8 +464,8 @@ def get_column_types(self, column: str) -> list[TypeVar]: column: str Name of the column to analyze. - Returns: list[TypeVar] - A list of Python types. + Returns: + list[TypeVar]: A list of Python types. """ @@ -462,8 +475,8 @@ def get_columns_type_stats(self) -> list[dict]: """ Return descriptive stats for all columns. - Returns: list[dict] - A list of dicts, each containing a column 'name' and a 'type' list. + Returns: + list[dict]: A list of dicts, each containing a column 'name' and a 'type' list. """ @@ -480,8 +493,8 @@ def convert_table(self, *args) -> ETL: args: The update function, method, or variable to process the update. Can also... - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -520,8 +533,8 @@ def unpack_dict( Value to prepend new columns if prepend=True. If None, will set to column name. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -589,8 +602,8 @@ def unpack_list( max_columns: int Optionally, the maximum number of columns to unpack. - Returns: petl.util.base.Table - The new table or returns None if table is being replaced. + Returns: + petl.util.base.Table: The new table or returns None if table is being replaced. """ @@ -646,8 +659,8 @@ def unpack_nested_columns_as_rows( If False (default): Return unpacked rows (with key column only) as standalone. Removes packed list and dict rows from original either way. - Returns: parsons.etl.table.Table - If expand_original, original table with packed rows replaced + Returns: + parsons.etl.table.Table: If expand_original, original table with packed rows replaced by unpacked rows. Otherwise, standalone table with key column and unpacked values only. """ @@ -799,8 +812,8 @@ def long_table( Value to prepend new columns if prepend=True. If None, will set to column name. - Returns: parsons.etl.table.Table - The Parsons Table. + Returns: + parsons.etl.table.Table: The Parsons Table. """ @@ -836,8 +849,8 @@ def cut(self, *columns) -> ETL: columns Columns in the parsons table. - Returns: parsons.etl.table.Table - The Parsons Table. + Returns: + parsons.etl.table.Table: The Parsons Table. """ @@ -879,8 +892,8 @@ def select_rows(self, *filters) -> ETL: filters: Function or str. - Returns: parsons.etl.table.Table - A new parsons table containing the selected rows. + Returns: + parsons.etl.table.Table: A new parsons table containing the selected rows. """ @@ -900,8 +913,8 @@ def remove_null_rows(self, columns: list[str] | str, null_value: Any = None) -> null_value: Any The null value. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ if isinstance(columns, str): @@ -974,8 +987,8 @@ def chunk(self, rows: int) -> list[ETL]: rows: int The number of rows of each new Parsons table. - Returns: list[parsons.etl.table.Table] - A list of Parsons tables. + Returns: + list[parsons.etl.table.Table]: A list of Parsons tables. """ @@ -994,8 +1007,8 @@ def get_normalized_column_name(column_name: str) -> str: Args: column_name: str - Returns: str - Normalized column name. + Returns: + str: Normalized column name. """ @@ -1022,15 +1035,20 @@ def match_columns( Eg. With this flag set, "FIRST NAME" would match "first_name". If the Table has two columns that normalize to the same string (eg. "FIRST NAME" and "first_name"), the latter will be considered an extra column. - if_extra_columns: string + if_extra_columns: str If the Table has columns that don't match any desired columns, either 'remove' them, 'ignore' them, or 'fail' (raising an error). - if_missing_columns: string + if_missing_columns: str If the Table is missing some of the desired columns, either 'add' them (with a value of None), 'ignore' them, or 'fail' (raising an error). - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). + + Raises: + TypeError: + Strategy was to fail or an invalid strategy was passed to if_extra_columns or + if_missing_columns. """ @@ -1186,8 +1204,8 @@ def reduce_rows( kwargs: Optionally, the buffersize, tempdir, or cache to use. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -1213,8 +1231,8 @@ def sort(self, columns: list[str] | str | None = None, reverse: bool = False) -> reverse: boolean Sort rows in reverse order. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ @@ -1230,8 +1248,8 @@ def set_header(self, new_header: list[str]) -> ETL: new_header: list[str] List of new header column names. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """ self.table = petl.setheader(self.table, new_header) @@ -1284,8 +1302,8 @@ def use_petl(self, petl_method: str, *args, **kwargs) -> ETL: kwargs: The keyword arguments to pass to the petl function. - Returns: parsons.etl.table.Table - A Parsons table. + Returns: + parsons.etl.table.Table: A Parsons table. """ @@ -1380,8 +1398,8 @@ def deduplicate(self, keys: list[str] | str | None = None, presorted: bool = Fal presorted: bool If false, the row will be sorted. - Returns: ETL - The modified ETL (self). + Returns: + ETL: The modified ETL (self). """