@@ -34,24 +34,22 @@ def _clean_string_expr(col_name: str) -> pl.Expr:
     return (
         pl.col(col_name)
         .str.strip_chars()
-        .replace(
-            {
-                "-": None,
-                "": None,
-                "None": None,
-                "none": None,
-                "NONE": None,
-                "NaN": None,
-                "Nan": None,
-                "nan": None,
-                "NAN": None,
-                "N/A": None,
-                "n/a": None,
-                "null": None,
-                "Null": None,
-                "NULL": None,
-            }
-        )
+        .replace({
+            "-": None,
+            "": None,
+            "None": None,
+            "none": None,
+            "NONE": None,
+            "NaN": None,
+            "Nan": None,
+            "nan": None,
+            "NAN": None,
+            "N/A": None,
+            "n/a": None,
+            "null": None,
+            "Null": None,
+            "NULL": None,
+        })
     )


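For context: the reshaped `.replace({...})` call maps common placeholder strings to real nulls after stripping whitespace, so later dtype inference never sees "N/A"-style tokens. A minimal stand-alone sketch of that behavior (the frame and the column name `raw` are invented for illustration, and only a few of the tokens are shown):

    import polars as pl

    df = pl.DataFrame({"raw": ["  42 ", "N/A", "none", "-", "7"]})
    cleaned = df.with_columns(
        pl.col("raw").str.strip_chars().replace({"N/A": None, "none": None, "-": None})
    )
    # "N/A", "none" and "-" become nulls; "42" and "7" stay as strings for later casting
    print(cleaned)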
@@ -79,7 +77,7 @@ def _optimize_numeric_column(
     # If all values are null, cast to Null type if allow_null is True
     if allow_null:
         return expr.cast(pl.Null())
-
+
     if not allow_unsigned:
         # If unsigned types are not allowed, ensure we use signed integer types
         if dtype.is_integer() and not dtype.is_signed_integer():
@@ -138,14 +136,16 @@ def _optimize_string_column(
     elif stripped.str.contains(INTEGER_REGEX).all(ignore_nulls=False):
         int_expr = cleaned_expr.cast(pl.Int64).alias(col_name)
         return (
-            int_expr.shrink_dtype().alias(col_name)
-            if shrink_numerics
-            else int_expr.alias(col_name)
+            int_expr.shrink_dtype().alias(col_name)
+            if shrink_numerics
+            else int_expr.alias(col_name)
         )

     # Check for numeric values
     elif stripped.str.contains(FLOAT_REGEX).all(ignore_nulls=False):
-        float_expr = cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
+        float_expr = (
+            cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
+        )

         if shrink_numerics:
             # Check if values can fit in Float32
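The wrapped `float_expr` above is only a line-length change; the logic is still "match FLOAT_REGEX, normalize decimal commas, cast to Float64, optionally shrink". A rough sketch of the comma-normalization step on its own (column name and sample values are made up; the module's real FLOAT_REGEX is not reproduced here):

    import polars as pl

    df = pl.DataFrame({"v": ["1,5", "2.25", "3"]})
    out = df.with_columns(pl.col("v").str.replace_all(",", ".").cast(pl.Float64))
    print(out)  # v becomes Float64: 1.5, 2.25, 3.0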
@@ -188,9 +188,13 @@ def _get_column_expr(

     # Process based on current type
     if series.dtype.is_numeric():
-        return _optimize_numeric_column(series, shrink_numerics, allow_unsigned, allow_null)
+        return _optimize_numeric_column(
+            series, shrink_numerics, allow_unsigned, allow_null
+        )
     elif series.dtype == pl.Utf8:
-        return _optimize_string_column(series, col_name, shrink_numerics, time_zone, allow_null)
+        return _optimize_string_column(
+            series, col_name, shrink_numerics, time_zone, allow_null
+        )

     # Keep original for other types
     return pl.col(col_name)
@@ -423,41 +427,30 @@ def unnest_all(df: pl.DataFrame, seperator="_", fields: list[str] | None = None)
     def _unnest_all(struct_columns):
         if fields is not None:
             return (
-                df.with_columns(
-                    [
-                        pl.col(col).struct.rename_fields(
-                            [
-                                f"{col}{seperator}{field_name}"
-                                for field_name in df[col].struct.fields
-                            ]
-                        )
-                        for col in struct_columns
-                    ]
-                )
+                df.with_columns([
+                    pl.col(col).struct.rename_fields([
+                        f"{col}{seperator}{field_name}"
+                        for field_name in df[col].struct.fields
+                    ])
+                    for col in struct_columns
+                ])
                 .unnest(struct_columns)
                 .select(
                     list(set(df.columns) - set(struct_columns))
-                    + sorted(
-                        [
-                            f"{col}{seperator}{field_name}"
-                            for field_name in fields
-                            for col in struct_columns
-                        ]
-                    )
+                    + sorted([
+                        f"{col}{seperator}{field_name}"
+                        for field_name in fields
+                        for col in struct_columns
+                    ])
                 )
             )

-        return df.with_columns(
-            [
-                pl.col(col).struct.rename_fields(
-                    [
-                        f"{col}{seperator}{field_name}"
-                        for field_name in df[col].struct.fields
-                    ]
-                )
-                for col in struct_columns
-            ]
-        ).unnest(struct_columns)
+        return df.with_columns([
+            pl.col(col).struct.rename_fields([
+                f"{col}{seperator}{field_name}" for field_name in df[col].struct.fields
+            ])
+            for col in struct_columns
+        ]).unnest(struct_columns)

     struct_columns = [col for col in df.columns if df[col].dtype == pl.Struct]  # noqa: F821
     while len(struct_columns):
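Whichever bracket style is used, `_unnest_all` still renames every struct field to `{col}{seperator}{field}` and then unnests, looping while struct columns remain. A small illustration of that rename-then-unnest pattern on an invented frame (a single pass, no outer loop):

    import polars as pl

    df = pl.DataFrame({"id": [1, 2], "meta": [{"a": 10, "b": 20}, {"a": 30, "b": 40}]})
    flat = df.with_columns(
        pl.col("meta").struct.rename_fields(
            [f"meta_{f}" for f in df["meta"].struct.fields]
        )
    ).unnest("meta")
    print(flat.columns)  # ["id", "meta_a", "meta_b"]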
@@ -499,15 +492,13 @@ def with_strftime_columns(
         ]
     # print("timestamp_column, with_strftime_columns", timestamp_column)
     return opt_dtype(
-        df.with_columns(
-            [
-                pl.col(timestamp_column)
-                .dt.strftime(strftime_)
-                .fill_null(0)
-                .alias(column_name)
-                for strftime_, column_name in zip(strftime, column_names)
-            ]
-        ),
+        df.with_columns([
+            pl.col(timestamp_column)
+            .dt.strftime(strftime_)
+            .fill_null(0)
+            .alias(column_name)
+            for strftime_, column_name in zip(strftime, column_names)
+        ]),
         include=column_names,
         strict=False,
     )
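As before, the comprehension emits one `dt.strftime` projection per format string and hands the result to `opt_dtype`. A hedged stand-alone version without the `opt_dtype`/`fill_null` wrapping (timestamps, formats and output names are illustrative only):

    import polars as pl
    from datetime import datetime

    df = pl.DataFrame({"ts": [datetime(2024, 1, 31), datetime(2024, 2, 29)]})
    out = df.with_columns([
        pl.col("ts").dt.strftime(fmt).alias(name)
        for fmt, name in zip(["%Y", "%Y-%m"], ["year", "year_month"])
    ])
    print(out)  # adds string columns "year" and "year_month"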
@@ -542,12 +533,10 @@ def with_truncated_columns(
         truncate_by = [
             get_timedelta_str(truncate_, to="polars") for truncate_ in truncate_by
         ]
-    return df.with_columns(
-        [
-            pl.col(timestamp_column).dt.truncate(truncate_).alias(column_name)
-            for truncate_, column_name in zip(truncate_by, column_names)
-        ]
-    )
+    return df.with_columns([
+        pl.col(timestamp_column).dt.truncate(truncate_).alias(column_name)
+        for truncate_, column_name in zip(truncate_by, column_names)
+    ])


 def with_datepart_columns(
@@ -697,9 +686,9 @@ def cast_relaxed(
     columns = df.schema.names()
     new_columns = [col for col in schema.names() if col not in columns]
     if len(new_columns):
-        return df.with_columns(
-            [pl.lit(None).alias(new_col) for new_col in new_columns]
-        ).cast(schema)
+        return df.with_columns([
+            pl.lit(None).alias(new_col) for new_col in new_columns
+        ]).cast(schema)
     return df.cast(schema)


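`cast_relaxed` keeps its behavior: any column present in the target schema but missing from the frame is added as a null literal before the cast. A simplified sketch using a plain dict in place of the schema object (names and dtypes are invented):

    import polars as pl

    df = pl.DataFrame({"a": [1, 2]})
    schema = {"a": pl.Int32, "b": pl.Utf8}
    missing = [c for c in schema if c not in df.columns]
    out = df.with_columns([pl.lit(None).alias(c) for c in missing]).cast(schema)
    print(out.schema)  # a: Int32, b: String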