Skip to content

Commit 2cb15e5

Browse files
fix issue with missing value in column labels
1 parent 0c5f862 commit 2cb15e5

File tree

2 files changed

+31
-12
lines changed

2 files changed

+31
-12
lines changed

python/pyarrow/pandas_compat.py

+9-5
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
174174
}
175175
string_dtype = 'object'
176176

177-
if name is not None and not isinstance(name, str):
177+
if (
178+
name is not None
179+
and not (isinstance(name, float) and np.isnan(name))
180+
and not isinstance(name, str)
181+
):
178182
raise TypeError(
179183
'Column name must be a string. Got column {} of type {}'.format(
180184
name, type(name).__name__
@@ -331,8 +335,8 @@ def _column_name_to_strings(name):
331335
return str(tuple(map(_column_name_to_strings, name)))
332336
elif isinstance(name, Sequence):
333337
raise TypeError("Unsupported type for MultiIndex level")
334-
elif name is None:
335-
return None
338+
elif name is None or (isinstance(name, float) and np.isnan(name)):
339+
return name
336340
return str(name)
337341

338342

@@ -1068,9 +1072,9 @@ def get_pandas_logical_type_map():
10681072
'date': 'datetime64[D]',
10691073
'datetime': 'datetime64[ns]',
10701074
'datetimetz': 'datetime64[ns]',
1071-
'unicode': np.str_,
1075+
'unicode': 'str',
10721076
'bytes': np.bytes_,
1073-
'string': np.str_,
1077+
'string': 'str',
10741078
'integer': np.int64,
10751079
'floating': np.float64,
10761080
'decimal': np.object_,

python/pyarrow/tests/test_pandas.py

+22-7
Original file line number | Diff line number | Diff line change
@@ -349,10 +349,18 @@ def test_integer_index_column(self):
349349
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
350350
_check_pandas_roundtrip(df, preserve_index=True)
351351

352-
def test_index_metadata_field_name(self, request):
353-
if _pandas_api.uses_string_dtype():
354-
# https://github.com/pandas-dev/pandas/issues/59879
355-
request.applymarker(pytest.mark.xfail(reason="bug in pandas string dtype"))
352+
def test_float_column_index_with_missing(self):
353+
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=[1.5, np.nan])
354+
_check_pandas_roundtrip(df, preserve_index=True)
355+
356+
@pytest.mark.filterwarnings(
357+
"ignore:The DataFrame has column names of mixed type:UserWarning"
358+
)
359+
def test_string_column_index_with_missing(self):
360+
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=["A", None])
361+
_check_pandas_roundtrip(df, preserve_index=True)
362+
363+
def test_index_metadata_field_name(self):
356364
# test None case, and strangely named non-index columns
357365
df = pd.DataFrame(
358366
[(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
@@ -362,17 +370,24 @@ def test_index_metadata_field_name(self, request):
362370
),
363371
columns=['a', None, '__index_level_0__'],
364372
)
365-
with pytest.warns(UserWarning):
373+
if _pandas_api.uses_string_dtype():
366374
t = pa.Table.from_pandas(df, preserve_index=True)
375+
else:
376+
with pytest.warns(UserWarning):
377+
t = pa.Table.from_pandas(df, preserve_index=True)
367378
js = t.schema.pandas_metadata
368379

369380
col1, col2, col3, idx0, foo = js['columns']
370381

371382
assert col1['name'] == 'a'
372383
assert col1['name'] == col1['field_name']
373384

374-
assert col2['name'] is None
375-
assert col2['field_name'] == 'None'
385+
if _pandas_api.uses_string_dtype():
386+
assert np.isnan(col2['name'])
387+
assert col2['field_name'] == 'nan'
388+
else:
389+
assert col2['name'] is None
390+
assert col2['field_name'] == 'None'
376391

377392
assert col3['name'] == '__index_level_0__'
378393
assert col3['name'] == col3['field_name']

0 commit comments

Comments (0)