Skip to content

Commit 64e8f2c

Browse files
will-larkin and WillAyd authored
BUG: fix read_json ignoring the dtype with the pyarrow engine (#60997)
* fix: pass dtypes to read_json with pyarrow engine * fix: code checks * fix: commit checks * fix: commit checks * fix: commit checks * fic: formatting * fix: commit checks * feat: change type conversion * Update _json.py * Update _json.py * Update _json.py * Update _json.py * Update pandas/tests/io/json/test_pandas.py Co-authored-by: William Ayd <[email protected]> * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py * Update _json.py * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py * Update test_pandas.py --------- Co-authored-by: William Ayd <[email protected]>
1 parent 882fa9c commit 64e8f2c

File tree

3 files changed

+85
-23
lines changed

3 files changed

+85
-23
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,7 @@ I/O
741741
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
742742
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
743743
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
744+
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
744745
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
745746
- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
746747
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

pandas/io/json/_json.py

+56-22
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,12 @@
3232
from pandas.core.dtypes.common import (
3333
ensure_str,
3434
is_string_dtype,
35+
pandas_dtype,
3536
)
3637
from pandas.core.dtypes.dtypes import PeriodDtype
3738

3839
from pandas import (
40+
ArrowDtype,
3941
DataFrame,
4042
Index,
4143
MultiIndex,
@@ -942,29 +944,61 @@ def read(self) -> DataFrame | Series:
942944
obj: DataFrame | Series
943945
with self:
944946
if self.engine == "pyarrow":
945-
pyarrow_json = import_optional_dependency("pyarrow.json")
946-
pa_table = pyarrow_json.read_json(self.data)
947-
return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
947+
obj = self._read_pyarrow()
948948
elif self.engine == "ujson":
949-
if self.lines:
950-
if self.chunksize:
951-
obj = concat(self)
952-
elif self.nrows:
953-
lines = list(islice(self.data, self.nrows))
954-
lines_json = self._combine_lines(lines)
955-
obj = self._get_object_parser(lines_json)
956-
else:
957-
data = ensure_str(self.data)
958-
data_lines = data.split("\n")
959-
obj = self._get_object_parser(self._combine_lines(data_lines))
960-
else:
961-
obj = self._get_object_parser(self.data)
962-
if self.dtype_backend is not lib.no_default:
963-
return obj.convert_dtypes(
964-
infer_objects=False, dtype_backend=self.dtype_backend
965-
)
966-
else:
967-
return obj
949+
obj = self._read_ujson()
950+
951+
return obj
952+
953+
def _read_pyarrow(self) -> DataFrame:
954+
"""
955+
Read JSON using the pyarrow engine.
956+
"""
957+
pyarrow_json = import_optional_dependency("pyarrow.json")
958+
options = None
959+
960+
if isinstance(self.dtype, dict):
961+
pa = import_optional_dependency("pyarrow")
962+
fields = []
963+
for field, dtype in self.dtype.items():
964+
pd_dtype = pandas_dtype(dtype)
965+
if isinstance(pd_dtype, ArrowDtype):
966+
fields.append((field, pd_dtype.pyarrow_dtype))
967+
968+
schema = pa.schema(fields)
969+
options = pyarrow_json.ParseOptions(
970+
explicit_schema=schema, unexpected_field_behavior="infer"
971+
)
972+
973+
pa_table = pyarrow_json.read_json(self.data, parse_options=options)
974+
df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
975+
976+
return df
977+
978+
def _read_ujson(self) -> DataFrame | Series:
979+
"""
980+
Read JSON using the ujson engine.
981+
"""
982+
obj: DataFrame | Series
983+
if self.lines:
984+
if self.chunksize:
985+
obj = concat(self)
986+
elif self.nrows:
987+
lines = list(islice(self.data, self.nrows))
988+
lines_json = self._combine_lines(lines)
989+
obj = self._get_object_parser(lines_json)
990+
else:
991+
data = ensure_str(self.data)
992+
data_lines = data.split("\n")
993+
obj = self._get_object_parser(self._combine_lines(data_lines))
994+
else:
995+
obj = self._get_object_parser(self.data)
996+
if self.dtype_backend is not lib.no_default:
997+
return obj.convert_dtypes(
998+
infer_objects=False, dtype_backend=self.dtype_backend
999+
)
1000+
else:
1001+
return obj
9681002

9691003
def _get_object_parser(self, json: str) -> DataFrame | Series:
9701004
"""

pandas/tests/io/json/test_pandas.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import datetime
22
from datetime import timedelta
3-
from io import StringIO
3+
from io import (
4+
BytesIO,
5+
StringIO,
6+
)
47
import json
58
import os
69
import sys
@@ -2184,6 +2187,30 @@ def test_read_json_dtype_backend(
21842187
# string_storage setting -> ignore that for checking the result
21852188
tm.assert_frame_equal(result, expected, check_column_type=False)
21862189

2190+
@td.skip_if_no("pyarrow")
2191+
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
2192+
def test_read_json_pyarrow_with_dtype(self):
2193+
dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"}
2194+
json = b'{"a": 1, "b": 2}\n'
2195+
2196+
df = read_json(
2197+
BytesIO(json),
2198+
dtype=dtype,
2199+
lines=True,
2200+
engine="pyarrow",
2201+
dtype_backend="pyarrow",
2202+
)
2203+
2204+
result = df.dtypes
2205+
expected = Series(
2206+
data=[
2207+
pd.ArrowDtype.construct_from_string("int32[pyarrow]"),
2208+
pd.ArrowDtype.construct_from_string("int64[pyarrow]"),
2209+
],
2210+
index=["a", "b"],
2211+
)
2212+
tm.assert_series_equal(result, expected)
2213+
21872214
@pytest.mark.parametrize("orient", ["split", "records", "index"])
21882215
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
21892216
# GH#50750

0 commit comments

Comments
 (0)