Skip to content

Commit 6e3d653

Browse files
authored
Refactor JobsAnonymizedRollup to improve numeric column handling and execution environment ID checks. Updated casting to prevent coercion errors in pandas and streamlined NaN checks for EE ID. (#351)
1 parent 60d7f32 commit 6e3d653

1 file changed

Lines changed: 10 additions & 3 deletions

File tree

metrics_utility/anonymized_rollups/jobs_anonymized_rollup.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,15 @@ def _convert_id_columns_to_strings(self, dataframe):
2828
numeric = pd.to_numeric(dataframe[col], errors='coerce')
2929
mask = numeric.notna()
3030
if mask.any():
31-
# Convert only the numeric rows to integer strings; leave NaN rows untouched
32-
dataframe.loc[mask, col] = numeric[mask].astype(int).astype(str)
31+
# Cast to object dtype first so that assigning string values does not
32+
# trigger a StringArray → int64 coercion error in pandas 2.2+/3.x,
33+
# which occurs when the original column dtype is int64 and pandas
34+
# tries to cast the assigned StringArray back to int64.
35+
dataframe[col] = dataframe[col].astype(object)
36+
# Use to_numpy(dtype=float) to escape any nullable Float64Dtype and
37+
# produce a plain numpy float array before converting to int strings,
38+
# ensuring the result is an object array rather than a StringArray.
39+
dataframe.loc[mask, col] = numeric[mask].to_numpy(dtype=float).astype(int).astype(str)
3340

3441
return dataframe
3542

@@ -663,7 +670,7 @@ def _get_collection_cache_key(self, row, installed_collections_data):
663670
Falls back to hashing the raw string for rows that have no EE id.
664671
"""
665672
ee_id = getattr(row, 'execution_environment_id', None)
666-
if ee_id is not None and not (isinstance(ee_id, float) and pd.isna(ee_id)):
673+
if ee_id is not None and not pd.isna(ee_id):
667674
return ('ee', int(ee_id))
668675
raw = installed_collections_data if isinstance(installed_collections_data, str) else str(installed_collections_data)
669676
return ('raw', hash(raw))

0 commit comments

Comments
 (0)