Skip to content

Commit 032fc53

Browse files
committed
Bump version to 0.11.6 and clean up regex comments in PyArrow and Polars helpers
1 parent e0588f0 commit 032fc53

File tree

3 files changed

+97
-107
lines changed

3 files changed

+97
-107
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ description = "A simple workflow framework. Hamilton + APScheduler = FlowerPower
44
authors = [{ name = "Volker L.", email = "[email protected]" }]
55
readme = "README.md"
66
requires-python = ">= 3.11"
7-
version = "0.11.5.8"
7+
version = "0.11.6"
88
keywords = [
99
"hamilton",
1010
"workflow",

src/flowerpower/plugins/io/helpers/polars.py

Lines changed: 43 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1-
21
import numpy as np
32
import polars as pl
43
import polars.selectors as cs
54

6-
75
from .datetime import get_timedelta_str, get_timestamp_column
86

97
# Pre-compiled regex patterns (identical to original)
@@ -13,16 +11,16 @@
1311
BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"
1412
DATETIME_REGEX = (
1513
r"^("
16-
r"\d{4}-\d{2}-\d{2}" # ISO: 2023-12-31
14+
r"\d{4}-\d{2}-\d{2}" # ISO: 2023-12-31
1715
r"|"
18-
r"\d{2}/\d{2}/\d{4}" # US: 12/31/2023
16+
r"\d{2}/\d{2}/\d{4}" # US: 12/31/2023
1917
r"|"
20-
r"\d{2}\.\d{2}\.\d{4}" # German: 31.12.2023
18+
r"\d{2}\.\d{2}\.\d{4}" # German: 31.12.2023
2119
r"|"
22-
r"\d{8}" # Compact: 20231231
20+
r"\d{8}" # Compact: 20231231
2321
r")"
2422
r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?" # Optional time: 23:59[:59[.123456]]
25-
r"([+-]\d{2}:?\d{2}|Z)?" # Optional timezone: +01:00, -0500, Z
23+
r"([+-]\d{2}:?\d{2}|Z)?" # Optional timezone: +01:00, -0500, Z
2624
r"$"
2725
)
2826

@@ -31,13 +29,10 @@
3129
F32_MAX = float(np.finfo(np.float32).max)
3230

3331

34-
3532
def _clean_string_expr(col_name: str) -> pl.Expr:
    """Build an expression that strips surrounding whitespace from a string
    column and maps the placeholder values "-", "" and "None" to null."""
    trimmed = pl.col(col_name).str.strip_chars()
    return trimmed.replace({"-": None, "": None, "None": None})
4237

4338

@@ -46,7 +41,7 @@ def _can_downcast_to_float32(series: pl.Series) -> bool:
4641
finite_values = series.filter(series.is_finite())
4742
if finite_values.is_empty():
4843
return True
49-
44+
5045
min_val, max_val = finite_values.min(), finite_values.max()
5146
return F32_MIN <= min_val <= max_val <= F32_MAX
5247

@@ -55,83 +50,87 @@ def _optimize_numeric_column(series: pl.Series, col_name: str, shrink: bool) ->
5550
"""Optimize numeric column types."""
5651
if not shrink:
5752
return pl.col(col_name)
58-
53+
5954
if series.dtype == pl.Float64 and not _can_downcast_to_float32(series):
6055
return pl.col(col_name)
61-
56+
6257
return pl.col(col_name).shrink_dtype()
6358

6459

6560
def _optimize_string_column(
    series: pl.Series,
    col_name: str,
    shrink_numerics: bool,
    time_zone: str | None = None,
) -> pl.Expr:
    """Convert string column to appropriate type based on content analysis.

    Inspects the non-null values of *series* and returns an expression that
    casts the column to boolean, integer, float or datetime when every value
    matches the corresponding pre-compiled pattern; otherwise the column is
    returned unchanged.
    """
    clean = _clean_string_expr(col_name)

    # Treat "-", "" and "None" as missing; a column with no remaining values
    # is collapsed to the smallest integer dtype.
    values = series.drop_nulls().replace({"-": None, "": None, "None": None})
    if len(values) == 0:
        return pl.col(col_name).cast(pl.Int8)

    trimmed = values.str.strip_chars()
    lowered = trimmed.str.to_lowercase()

    # Boolean column: every value matches the boolean pattern.
    if lowered.str.contains(BOOLEAN_REGEX).all():
        flags = clean.str.to_lowercase().str.contains(BOOLEAN_TRUE_REGEX)
        return flags.alias(col_name)

    # Integer column, optionally shrunk to the narrowest fitting dtype.
    if trimmed.str.contains(INTEGER_REGEX).all():
        as_int = clean.cast(pl.Int64)
        if shrink_numerics:
            as_int = as_int.shrink_dtype()
        return as_int.alias(col_name)

    # Float column: decimal commas are normalised to dots before casting.
    if trimmed.str.contains(FLOAT_REGEX).all():
        as_float = clean.str.replace_all(",", ".").cast(pl.Float64)
        if shrink_numerics:
            # Probe the actual values to decide whether Float32 is lossless enough.
            sample = trimmed.str.replace_all(",", ".").cast(
                pl.Float64, strict=False
            )
            if _can_downcast_to_float32(sample):
                return as_float.shrink_dtype().alias(col_name)
        return as_float.alias(col_name)

    # Datetime column; any parsing failure leaves the column untouched.
    try:
        if trimmed.str.contains(DATETIME_REGEX).all():
            parsed = clean.str.to_datetime(
                strict=False, time_unit="us", time_zone=time_zone
            )
            return parsed.alias(col_name)
    except pl.exceptions.PolarsError:
        pass

    # No conversion applies: keep the original column.
    return pl.col(col_name)
114116

115117

116118
def _get_column_expr(
    df: pl.DataFrame, col_name: str, shrink_numerics: bool, time_zone: str | None = None
) -> pl.Expr:
    """Generate optimization expression for a single column.

    Dispatches on the column's current dtype: all-null columns collapse to
    Int8, numeric columns may be shrunk, string columns are analysed for
    boolean/numeric/datetime content, and every other dtype passes through
    unchanged.
    """
    series = df[col_name]

    # A column that is entirely null carries no data; store it as tiny Int8.
    if series.is_null().all():
        return pl.col(col_name).cast(pl.Int8)

    dtype = series.dtype
    if dtype.is_numeric():
        return _optimize_numeric_column(series, col_name, shrink_numerics)
    if dtype == pl.Utf8:
        return _optimize_string_column(series, col_name, shrink_numerics, time_zone)

    # Non-numeric, non-string dtypes are left as-is.
    return pl.col(col_name)
137136

@@ -145,18 +144,18 @@ def opt_dtype(
145144
) -> pl.DataFrame:
146145
"""
147146
Optimize data types of a Polars DataFrame for performance and memory efficiency.
148-
147+
149148
This function analyzes each column and converts it to the most appropriate
150149
data type based on content, handling string-to-type conversions and
151150
numeric type downcasting.
152-
151+
153152
Args:
154153
df: DataFrame to optimize
155154
include: Column(s) to include in optimization (default: all columns)
156155
exclude: Column(s) to exclude from optimization
157156
time_zone: Optional time zone for datetime parsing
158157
shrink_numerics: Whether to downcast numeric types when possible
159-
158+
160159
Returns:
161160
DataFrame with optimized data types
162161
"""
@@ -165,20 +164,20 @@ def opt_dtype(
165164
include = [include]
166165
if isinstance(exclude, str):
167166
exclude = [exclude]
168-
167+
169168
# Determine columns to process
170169
cols_to_process = df.columns
171170
if include:
172171
cols_to_process = [col for col in include if col in df.columns]
173172
if exclude:
174173
cols_to_process = [col for col in cols_to_process if col not in exclude]
175-
174+
176175
# Generate optimization expressions for all columns
177176
expressions = [
178177
_get_column_expr(df, col_name, shrink_numerics, time_zone)
179178
for col_name in cols_to_process
180179
]
181-
180+
182181
# Apply all transformations at once if any exist
183182
return df if not expressions else df.with_columns(expressions)
184183

0 commit comments

Comments (0)