@@ -831,9 +831,7 @@ def table_to_dataframe(
831
831
}
832
832
833
833
834
- def _get_extension_dtypes (
835
- table , columns_metadata , types_mapper = None , options = None , categories = None
836
- ):
834
+ def _get_extension_dtypes (table , columns_metadata , types_mapper , options , categories ):
837
835
"""
838
836
Based on the stored column pandas metadata and the extension types
839
837
in the arrow schema, infer which columns should be converted to a
@@ -879,6 +877,11 @@ def _get_extension_dtypes(
879
877
pandas_dtype = _pandas_api .pandas_dtype (dtype )
880
878
if isinstance (pandas_dtype , _pandas_api .extension_dtype ):
881
879
if isinstance (pandas_dtype , _pandas_api .pd .StringDtype ):
880
+ # when the metadata indicate that the string dtype should be used,
881
+ # ignore this if:
882
+ # - it is specified to convert strings / this column to categorical
883
+ # - the column itself is dictionary encoded and would otherwise be
884
+ # converted to categorical
882
885
if strings_to_categorical or name in categories :
883
886
continue
884
887
try :
@@ -1162,7 +1165,14 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
1162
1165
level .dtype == "str" and numpy_dtype == "object"
1163
1166
and ("mixed" in pandas_dtype or pandas_dtype in ["unicode" , "string" ])
1164
1167
):
1165
- # in this case don't convert to object dtype, but keep using the str dtype
1168
+ # the metadata indicate that the original dataframe used object dtype,
1169
+ # but ignore this and keep string dtype if:
1170
+ # - the original columns used mixed types -> we don't attempt to faithfully
1171
+ # roundtrip in this case, but keep the column names as strings
1172
+ # - the original columns were inferred to be strings but stored in object
1173
+ # dtype -> we don't restore the object dtype because all metadata
1174
+ # generated using pandas < 3 will have this case by default, and
1175
+ # for pandas >= 3 we want to use the default string dtype for .columns
1166
1176
new_levels .append (level )
1167
1177
continue
1168
1178
elif level .dtype != dtype :
0 commit comments