Skip to content

Commit 394d2e2

Browse files
committed
Refactor string cleaning and optimize imports in PyArrow; improve readability of _clean_string_array function
1 parent 94e0202 commit 394d2e2

File tree

2 files changed

+66
-74
lines changed

2 files changed

+66
-74
lines changed

src/flowerpower/plugins/io/helpers/polars.py

Lines changed: 61 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -34,24 +34,22 @@ def _clean_string_expr(col_name: str) -> pl.Expr:
3434
return (
3535
pl.col(col_name)
3636
.str.strip_chars()
37-
.replace(
38-
{
39-
"-": None,
40-
"": None,
41-
"None": None,
42-
"none": None,
43-
"NONE": None,
44-
"NaN": None,
45-
"Nan": None,
46-
"nan": None,
47-
"NAN": None,
48-
"N/A": None,
49-
"n/a": None,
50-
"null": None,
51-
"Null": None,
52-
"NULL": None,
53-
}
54-
)
37+
.replace({
38+
"-": None,
39+
"": None,
40+
"None": None,
41+
"none": None,
42+
"NONE": None,
43+
"NaN": None,
44+
"Nan": None,
45+
"nan": None,
46+
"NAN": None,
47+
"N/A": None,
48+
"n/a": None,
49+
"null": None,
50+
"Null": None,
51+
"NULL": None,
52+
})
5553
)
5654

5755

@@ -79,7 +77,7 @@ def _optimize_numeric_column(
7977
# If all values are null, cast to Null type if allow_null is True
8078
if allow_null:
8179
return expr.cast(pl.Null())
82-
80+
8381
if not allow_unsigned:
8482
# If unsigned types are not allowed, ensure we use signed integer types
8583
if dtype.is_integer() and not dtype.is_signed_integer():
@@ -138,14 +136,16 @@ def _optimize_string_column(
138136
elif stripped.str.contains(INTEGER_REGEX).all(ignore_nulls=False):
139137
int_expr = cleaned_expr.cast(pl.Int64).alias(col_name)
140138
return (
141-
int_expr.shrink_dtype().alias(col_name)
142-
if shrink_numerics
143-
else int_expr.alias(col_name)
139+
int_expr.shrink_dtype().alias(col_name)
140+
if shrink_numerics
141+
else int_expr.alias(col_name)
144142
)
145143

146144
# Check for numeric values
147145
elif stripped.str.contains(FLOAT_REGEX).all(ignore_nulls=False):
148-
float_expr = cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
146+
float_expr = (
147+
cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
148+
)
149149

150150
if shrink_numerics:
151151
# Check if values can fit in Float32
@@ -188,9 +188,13 @@ def _get_column_expr(
188188

189189
# Process based on current type
190190
if series.dtype.is_numeric():
191-
return _optimize_numeric_column(series, shrink_numerics, allow_unsigned, allow_null)
191+
return _optimize_numeric_column(
192+
series, shrink_numerics, allow_unsigned, allow_null
193+
)
192194
elif series.dtype == pl.Utf8:
193-
return _optimize_string_column(series, col_name, shrink_numerics, time_zone, allow_null)
195+
return _optimize_string_column(
196+
series, col_name, shrink_numerics, time_zone, allow_null
197+
)
194198

195199
# Keep original for other types
196200
return pl.col(col_name)
@@ -423,41 +427,30 @@ def unnest_all(df: pl.DataFrame, seperator="_", fields: list[str] | None = None)
423427
def _unnest_all(struct_columns):
424428
if fields is not None:
425429
return (
426-
df.with_columns(
427-
[
428-
pl.col(col).struct.rename_fields(
429-
[
430-
f"{col}{seperator}{field_name}"
431-
for field_name in df[col].struct.fields
432-
]
433-
)
434-
for col in struct_columns
435-
]
436-
)
430+
df.with_columns([
431+
pl.col(col).struct.rename_fields([
432+
f"{col}{seperator}{field_name}"
433+
for field_name in df[col].struct.fields
434+
])
435+
for col in struct_columns
436+
])
437437
.unnest(struct_columns)
438438
.select(
439439
list(set(df.columns) - set(struct_columns))
440-
+ sorted(
441-
[
442-
f"{col}{seperator}{field_name}"
443-
for field_name in fields
444-
for col in struct_columns
445-
]
446-
)
440+
+ sorted([
441+
f"{col}{seperator}{field_name}"
442+
for field_name in fields
443+
for col in struct_columns
444+
])
447445
)
448446
)
449447

450-
return df.with_columns(
451-
[
452-
pl.col(col).struct.rename_fields(
453-
[
454-
f"{col}{seperator}{field_name}"
455-
for field_name in df[col].struct.fields
456-
]
457-
)
458-
for col in struct_columns
459-
]
460-
).unnest(struct_columns)
448+
return df.with_columns([
449+
pl.col(col).struct.rename_fields([
450+
f"{col}{seperator}{field_name}" for field_name in df[col].struct.fields
451+
])
452+
for col in struct_columns
453+
]).unnest(struct_columns)
461454

462455
struct_columns = [col for col in df.columns if df[col].dtype == pl.Struct] # noqa: F821
463456
while len(struct_columns):
@@ -499,15 +492,13 @@ def with_strftime_columns(
499492
]
500493
# print("timestamp_column, with_strftime_columns", timestamp_column)
501494
return opt_dtype(
502-
df.with_columns(
503-
[
504-
pl.col(timestamp_column)
505-
.dt.strftime(strftime_)
506-
.fill_null(0)
507-
.alias(column_name)
508-
for strftime_, column_name in zip(strftime, column_names)
509-
]
510-
),
495+
df.with_columns([
496+
pl.col(timestamp_column)
497+
.dt.strftime(strftime_)
498+
.fill_null(0)
499+
.alias(column_name)
500+
for strftime_, column_name in zip(strftime, column_names)
501+
]),
511502
include=column_names,
512503
strict=False,
513504
)
@@ -542,12 +533,10 @@ def with_truncated_columns(
542533
truncate_by = [
543534
get_timedelta_str(truncate_, to="polars") for truncate_ in truncate_by
544535
]
545-
return df.with_columns(
546-
[
547-
pl.col(timestamp_column).dt.truncate(truncate_).alias(column_name)
548-
for truncate_, column_name in zip(truncate_by, column_names)
549-
]
550-
)
536+
return df.with_columns([
537+
pl.col(timestamp_column).dt.truncate(truncate_).alias(column_name)
538+
for truncate_, column_name in zip(truncate_by, column_names)
539+
])
551540

552541

553542
def with_datepart_columns(
@@ -697,9 +686,9 @@ def cast_relaxed(
697686
columns = df.schema.names()
698687
new_columns = [col for col in schema.names() if col not in columns]
699688
if len(new_columns):
700-
return df.with_columns(
701-
[pl.lit(None).alias(new_col) for new_col in new_columns]
702-
).cast(schema)
689+
return df.with_columns([
690+
pl.lit(None).alias(new_col) for new_col in new_columns
691+
]).cast(schema)
703692
return df.cast(schema)
704693

705694

src/flowerpower/plugins/io/helpers/pyarrow.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1+
import concurrent.futures
2+
13
import numpy as np
24
import polars as pl
35
import pyarrow as pa
46
import pyarrow.compute as pc
5-
import concurrent.futures
67

78
# Pre-compiled regex patterns (identical to original)
89
INTEGER_REGEX = r"^[-+]?\d+$"
@@ -406,7 +407,9 @@ def _optimize_string_array(
406407
else:
407408
return array.type
408409

409-
cleaned_array = _clean_string_array(array, allow_null) # pc.utf8_trim_whitespace(array)
410+
cleaned_array = _clean_string_array(
411+
array, allow_null
412+
) # pc.utf8_trim_whitespace(array)
410413

411414
try:
412415
if _all_match_regex(cleaned_array, BOOLEAN_REGEX):

0 commit comments

Comments (0)