Skip to content

Commit 36ae10d

Browse files
ENH: Improved error message and raise new error for small-string NaN … (#60907)
ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829)
* Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep is too large for the PyTables column. Add and modify applicable test code.
* Fix missed tests and correct a mistake in the error message.
* Remove excess comments. Revert the error type change to avoid API changes. Move nan_rep tests into a separate function.
(cherry picked from commit 57340ec)
Co-authored-by: Jake Thomas Trevallion <[email protected]>
1 parent f1a4d76 commit 36ae10d

File tree

3 files changed

+41
-12
lines changed

3 files changed

+41
-12
lines changed

pandas/io/pytables.py

+9
Original file line numberDiff line numberDiff line change
@@ -3464,6 +3464,12 @@ def validate(self, other) -> None:
34643464
# Value of type "Optional[Any]" is not indexable [index]
34653465
oax = ov[i] # type: ignore[index]
34663466
if sax != oax:
3467+
if c == "values_axes" and sax.kind != oax.kind:
3468+
raise ValueError(
3469+
f"Cannot serialize the column [{oax.values[0]}] "
3470+
f"because its data contents are not [{sax.kind}] "
3471+
f"but [{oax.kind}] object dtype"
3472+
)
34673473
raise ValueError(
34683474
f"invalid combination of [{c}] on appending data "
34693475
f"[{sax}] vs current table [{oax}]"
@@ -5111,6 +5117,9 @@ def _maybe_convert_for_string_atom(
51115117
data = bvalues.copy()
51125118
data[mask] = nan_rep
51135119

5120+
if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize:
5121+
raise ValueError("NaN representation is too large for existing column size")
5122+
51145123
# see if we have a valid string type
51155124
inferred_type = lib.infer_dtype(data, skipna=False)
51165125
if inferred_type != "string":

pandas/tests/io/pytables/test_append.py

+29-6
Original file line numberDiff line numberDiff line change
@@ -818,12 +818,9 @@ def test_append_raise(setup_path):
818818
store.append("df", df)
819819
df["foo"] = "bar"
820820
msg = re.escape(
821-
"invalid combination of [values_axes] on appending data "
822-
"[name->values_block_1,cname->values_block_1,"
823-
"dtype->bytes24,kind->string,shape->(1, 30)] "
824-
"vs current table "
825-
"[name->values_block_1,cname->values_block_1,"
826-
"dtype->datetime64[s],kind->datetime64[s],shape->None]"
821+
"Cannot serialize the column [foo] "
822+
"because its data contents are not [string] "
823+
"but [datetime64[s]] object dtype"
827824
)
828825
with pytest.raises(ValueError, match=msg):
829826
store.append("df", df)
@@ -989,3 +986,29 @@ def test_append_to_multiple_min_itemsize(setup_path):
989986
)
990987
result = store.select_as_multiple(["index", "nums", "strs"])
991988
tm.assert_frame_equal(result, expected, check_index_type=True)
989+
990+
991+
def test_append_string_nan_rep(setup_path):
992+
# GH 16300
993+
df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
994+
df_nan = df.copy()
995+
df_nan.loc[0:4, :] = np.nan
996+
msg = "NaN representation is too large for existing column size"
997+
998+
with ensure_clean_store(setup_path) as store:
999+
# string column too small
1000+
store.append("sa", df["A"])
1001+
with pytest.raises(ValueError, match=msg):
1002+
store.append("sa", df_nan["A"])
1003+
1004+
# nan_rep too big
1005+
store.append("sb", df["B"], nan_rep="bars")
1006+
with pytest.raises(ValueError, match=msg):
1007+
store.append("sb", df_nan["B"])
1008+
1009+
# smaller modified nan_rep
1010+
store.append("sc", df["A"], nan_rep="n")
1011+
store.append("sc", df_nan["A"])
1012+
result = store["sc"]
1013+
expected = concat([df["A"], df_nan["A"]])
1014+
tm.assert_series_equal(result, expected)

pandas/tests/io/pytables/test_round_trip.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,9 @@ def test_table_values_dtypes_roundtrip(setup_path):
213213

214214
# incompatible dtype
215215
msg = re.escape(
216-
"invalid combination of [values_axes] on appending data "
217-
"[name->values_block_0,cname->values_block_0,"
218-
"dtype->float64,kind->float,shape->(1, 3)] vs "
219-
"current table [name->values_block_0,"
220-
"cname->values_block_0,dtype->int64,kind->integer,"
221-
"shape->None]"
216+
"Cannot serialize the column [a] "
217+
"because its data contents are not [float] "
218+
"but [integer] object dtype"
222219
)
223220
with pytest.raises(ValueError, match=msg):
224221
store.append("df_i8", df1)

0 commit comments

Comments
 (0)