1-
21import numpy as np
32import polars as pl
43import polars .selectors as cs
54
6-
75from .datetime import get_timedelta_str , get_timestamp_column
86
# Regex pattern strings used for content-based type detection
# Lower-cased values that are interpreted as boolean True.
BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"

# Date, with an optional time part and an optional UTC offset.
DATETIME_REGEX = (
    r"^("
    r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
    r"|"
    r"\d{2}/\d{2}/\d{4}"  # US: 12/31/2023
    r"|"
    r"\d{2}\.\d{2}\.\d{4}"  # German: 31.12.2023
    r"|"
    r"\d{8}"  # Compact: 20231231
    r")"
    r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?"  # Optional time: 23:59[:59[.123456]]
    r"([+-]\d{2}:?\d{2}|Z)?"  # Optional timezone: +01:00, -0500, Z
    r"$"
)

# Largest finite float32 value, as a Python float.
F32_MAX = float(np.finfo(np.float32).max)
3230
3331
34-
def _clean_string_expr(col_name: str) -> pl.Expr:
    """Create an expression that normalises the string values of a column.

    Surrounding whitespace is stripped first; afterwards the placeholder
    values ``"-"``, ``""`` and ``"None"`` are replaced with null.

    Args:
        col_name: Name of the string column to clean.

    Returns:
        A Polars expression yielding the cleaned string column.
    """
    return (
        pl.col(col_name)
        .str.strip_chars()
        .replace({"-": None, "": None, "None": None})
    )
4237
4338
@@ -46,7 +41,7 @@ def _can_downcast_to_float32(series: pl.Series) -> bool:
4641 finite_values = series .filter (series .is_finite ())
4742 if finite_values .is_empty ():
4843 return True
49-
44+
5045 min_val , max_val = finite_values .min (), finite_values .max ()
5146 return F32_MIN <= min_val <= max_val <= F32_MAX
5247
@@ -55,83 +50,87 @@ def _optimize_numeric_column(series: pl.Series, col_name: str, shrink: bool) ->
5550 """Optimize numeric column types."""
5651 if not shrink :
5752 return pl .col (col_name )
58-
53+
5954 if series .dtype == pl .Float64 and not _can_downcast_to_float32 (series ):
6055 return pl .col (col_name )
61-
56+
6257 return pl .col (col_name ).shrink_dtype ()
6358
6459
def _optimize_string_column(
    series: pl.Series,
    col_name: str,
    shrink_numerics: bool,
    time_zone: str | None = None,
) -> pl.Expr:
    """Convert a string column to an appropriate type based on its content.

    The values of ``series`` are inspected eagerly to pick a target type;
    the returned expression then performs the conversion on the column
    named ``col_name``. Candidates are tried in this order: boolean,
    integer, float, datetime. If none applies, the column is returned
    unchanged.

    Args:
        series: The column's data, used only for content analysis.
        col_name: Name of the column the returned expression operates on.
        shrink_numerics: If True, integer and float results are passed
            through ``shrink_dtype()`` (floats only when the values fit
            into Float32).
        time_zone: Optional time zone forwarded to ``str.to_datetime``.

    Returns:
        A Polars expression producing the converted (or original) column.
    """
    cleaned_expr = _clean_string_expr(col_name)

    # Analyse exactly what `cleaned_expr` will produce: strip first, then
    # null out the placeholder values, then drop the nulls. Replacing before
    # stripping would let padded placeholders such as " - " slip through and
    # block every conversion below.
    stripped = (
        series.str.strip_chars()
        .replace({"-": None, "": None, "None": None})
        .drop_nulls()
    )

    # Nothing but nulls/placeholders: same Int8 fallback as all-null columns.
    # The cast is applied to the cleaned expression because raw placeholder
    # strings (e.g. "-") cannot be cast to an integer.
    if stripped.is_empty():
        return cleaned_expr.cast(pl.Int8).alias(col_name)

    # Boolean: every value is a recognised boolean token (case-insensitive).
    if stripped.str.to_lowercase().str.contains(BOOLEAN_REGEX).all():
        return (
            cleaned_expr.str.to_lowercase()
            .str.contains(BOOLEAN_TRUE_REGEX)
            .alias(col_name)
        )

    # Integer
    if stripped.str.contains(INTEGER_REGEX).all():
        int_expr = cleaned_expr.cast(pl.Int64)
        if shrink_numerics:
            int_expr = int_expr.shrink_dtype()
        return int_expr.alias(col_name)

    # Float: commas are rewritten to dots before casting.
    if stripped.str.contains(FLOAT_REGEX).all():
        float_expr = cleaned_expr.str.replace_all(",", ".").cast(pl.Float64)

        if shrink_numerics:
            # Only shrink when the actual values fit into Float32.
            temp_floats = stripped.str.replace_all(",", ".").cast(
                pl.Float64, strict=False
            )
            if _can_downcast_to_float32(temp_floats):
                return float_expr.shrink_dtype().alias(col_name)

        return float_expr.alias(col_name)

    # Datetime: best effort. A Polars error here means "not a datetime
    # column", so fall through and keep the column as it is.
    try:
        if stripped.str.contains(DATETIME_REGEX).all():
            return cleaned_expr.str.to_datetime(
                strict=False, time_unit="us", time_zone=time_zone
            ).alias(col_name)
    except pl.exceptions.PolarsError:
        pass

    # Keep original if no conversion applies
    return pl.col(col_name)
114116
115117
def _get_column_expr(
    df: pl.DataFrame, col_name: str, shrink_numerics: bool, time_zone: str | None = None
) -> pl.Expr:
    """Generate the optimization expression for a single column.

    Args:
        df: DataFrame that contains the column.
        col_name: Name of the column to optimize.
        shrink_numerics: Forwarded to the numeric and string optimizers to
            control downcasting of numeric types.
        time_zone: Optional time zone forwarded to the string optimizer for
            datetime parsing.

    Returns:
        A Polars expression for the column: an Int8 cast for all-null
        columns, the result of the numeric or string optimizer for numeric
        and ``pl.Utf8`` columns, and the unchanged column otherwise.
    """
    series = df[col_name]

    # Handle all-null columns
    if series.is_null().all():
        return pl.col(col_name).cast(pl.Int8)

    # Process based on current type
    if series.dtype.is_numeric():
        return _optimize_numeric_column(series, col_name, shrink_numerics)
    if series.dtype == pl.Utf8:
        return _optimize_string_column(series, col_name, shrink_numerics, time_zone)

    # Keep original for other types
    return pl.col(col_name)
137136
@@ -145,18 +144,18 @@ def opt_dtype(
145144) -> pl .DataFrame :
146145 """
147146 Optimize data types of a Polars DataFrame for performance and memory efficiency.
148-
147+
149148 This function analyzes each column and converts it to the most appropriate
150149 data type based on content, handling string-to-type conversions and
151150 numeric type downcasting.
152-
151+
153152 Args:
154153 df: DataFrame to optimize
155154 include: Column(s) to include in optimization (default: all columns)
156155 exclude: Column(s) to exclude from optimization
157156 time_zone: Optional time zone for datetime parsing
158157 shrink_numerics: Whether to downcast numeric types when possible
159-
158+
160159 Returns:
161160 DataFrame with optimized data types
162161 """
@@ -165,20 +164,20 @@ def opt_dtype(
165164 include = [include ]
166165 if isinstance (exclude , str ):
167166 exclude = [exclude ]
168-
167+
169168 # Determine columns to process
170169 cols_to_process = df .columns
171170 if include :
172171 cols_to_process = [col for col in include if col in df .columns ]
173172 if exclude :
174173 cols_to_process = [col for col in cols_to_process if col not in exclude ]
175-
174+
176175 # Generate optimization expressions for all columns
177176 expressions = [
178177 _get_column_expr (df , col_name , shrink_numerics , time_zone )
179178 for col_name in cols_to_process
180179 ]
181-
180+
182181 # Apply all transformations at once if any exist
183182 return df if not expressions else df .with_columns (expressions )
184183
0 commit comments