Skip to content

Commit d9bcc7f

Browse files
authored
Merge branch 'main' into dev/bug/frequencyCollisions
2 parents 12ef20a + 9bd352d commit d9bcc7f

File tree

5 files changed

+87
-25
lines changed

5 files changed

+87
-25
lines changed

doc/source/development/contributing_codebase.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -537,7 +537,7 @@ Preferred ``pytest`` idioms
537537
test and does not check if the test will fail. If this is the behavior you desire, use ``pytest.skip`` instead.
538538

539539
If a test is known to fail but the manner in which it fails
540-
is not meant to be captured, use ``pytest.mark.xfail`` It is common to use this method for a test that
540+
is not meant to be captured, use ``pytest.mark.xfail``. It is common to use this method for a test that
541541
exhibits buggy behavior or a non-implemented feature. If
542542
the failing test has flaky behavior, use the argument ``strict=False``. This
543543
will make it so pytest does not fail if the test happens to pass. Using ``strict=False`` is highly undesirable, please use it only as a last resort.

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,7 @@ I/O
742742
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
743743
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
744744
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
745+
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
745746
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
746747
- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
747748
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

pandas/io/excel/_base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@
197197
False otherwise. An example of a valid callable argument would be ``lambda
198198
x: x in [0, 2]``.
199199
nrows : int, default None
200-
Number of rows to parse.
200+
Number of rows to parse. Does not include header rows.
201201
na_values : scalar, str, list-like, or dict, default None
202202
Additional strings to recognize as NA/NaN. If dict passed, specific
203203
per-column NA values. By default the following values are interpreted

pandas/io/json/_json.py

+56-22
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,12 @@
3232
from pandas.core.dtypes.common import (
3333
ensure_str,
3434
is_string_dtype,
35+
pandas_dtype,
3536
)
3637
from pandas.core.dtypes.dtypes import PeriodDtype
3738

3839
from pandas import (
40+
ArrowDtype,
3941
DataFrame,
4042
Index,
4143
MultiIndex,
@@ -942,29 +944,61 @@ def read(self) -> DataFrame | Series:
942944
obj: DataFrame | Series
943945
with self:
944946
if self.engine == "pyarrow":
945-
pyarrow_json = import_optional_dependency("pyarrow.json")
946-
pa_table = pyarrow_json.read_json(self.data)
947-
return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
947+
obj = self._read_pyarrow()
948948
elif self.engine == "ujson":
949-
if self.lines:
950-
if self.chunksize:
951-
obj = concat(self)
952-
elif self.nrows:
953-
lines = list(islice(self.data, self.nrows))
954-
lines_json = self._combine_lines(lines)
955-
obj = self._get_object_parser(lines_json)
956-
else:
957-
data = ensure_str(self.data)
958-
data_lines = data.split("\n")
959-
obj = self._get_object_parser(self._combine_lines(data_lines))
960-
else:
961-
obj = self._get_object_parser(self.data)
962-
if self.dtype_backend is not lib.no_default:
963-
return obj.convert_dtypes(
964-
infer_objects=False, dtype_backend=self.dtype_backend
965-
)
966-
else:
967-
return obj
949+
obj = self._read_ujson()
950+
951+
return obj
952+
953+
def _read_pyarrow(self) -> DataFrame:
954+
"""
955+
Read JSON using the pyarrow engine.
956+
"""
957+
pyarrow_json = import_optional_dependency("pyarrow.json")
958+
options = None
959+
960+
if isinstance(self.dtype, dict):
961+
pa = import_optional_dependency("pyarrow")
962+
fields = []
963+
for field, dtype in self.dtype.items():
964+
pd_dtype = pandas_dtype(dtype)
965+
if isinstance(pd_dtype, ArrowDtype):
966+
fields.append((field, pd_dtype.pyarrow_dtype))
967+
968+
schema = pa.schema(fields)
969+
options = pyarrow_json.ParseOptions(
970+
explicit_schema=schema, unexpected_field_behavior="infer"
971+
)
972+
973+
pa_table = pyarrow_json.read_json(self.data, parse_options=options)
974+
df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
975+
976+
return df
977+
978+
def _read_ujson(self) -> DataFrame | Series:
979+
"""
980+
Read JSON using the ujson engine.
981+
"""
982+
obj: DataFrame | Series
983+
if self.lines:
984+
if self.chunksize:
985+
obj = concat(self)
986+
elif self.nrows:
987+
lines = list(islice(self.data, self.nrows))
988+
lines_json = self._combine_lines(lines)
989+
obj = self._get_object_parser(lines_json)
990+
else:
991+
data = ensure_str(self.data)
992+
data_lines = data.split("\n")
993+
obj = self._get_object_parser(self._combine_lines(data_lines))
994+
else:
995+
obj = self._get_object_parser(self.data)
996+
if self.dtype_backend is not lib.no_default:
997+
return obj.convert_dtypes(
998+
infer_objects=False, dtype_backend=self.dtype_backend
999+
)
1000+
else:
1001+
return obj
9681002

9691003
def _get_object_parser(self, json: str) -> DataFrame | Series:
9701004
"""

pandas/tests/io/json/test_pandas.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import datetime
22
from datetime import timedelta
3-
from io import StringIO
3+
from io import (
4+
BytesIO,
5+
StringIO,
6+
)
47
import json
58
import os
69
import sys
@@ -2184,6 +2187,30 @@ def test_read_json_dtype_backend(
21842187
# string_storage setting -> ignore that for checking the result
21852188
tm.assert_frame_equal(result, expected, check_column_type=False)
21862189

2190+
@td.skip_if_no("pyarrow")
2191+
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
2192+
def test_read_json_pyarrow_with_dtype(self):
2193+
dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"}
2194+
json = b'{"a": 1, "b": 2}\n'
2195+
2196+
df = read_json(
2197+
BytesIO(json),
2198+
dtype=dtype,
2199+
lines=True,
2200+
engine="pyarrow",
2201+
dtype_backend="pyarrow",
2202+
)
2203+
2204+
result = df.dtypes
2205+
expected = Series(
2206+
data=[
2207+
pd.ArrowDtype.construct_from_string("int32[pyarrow]"),
2208+
pd.ArrowDtype.construct_from_string("int64[pyarrow]"),
2209+
],
2210+
index=["a", "b"],
2211+
)
2212+
tm.assert_series_equal(result, expected)
2213+
21872214
@pytest.mark.parametrize("orient", ["split", "records", "index"])
21882215
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
21892216
# GH#50750

0 commit comments

Comments
 (0)