BUG: fix read_json ignoring the dtype with the pyarrow engine (#60997)

will-larkin · WillAyd · web-flow · commit 64e8f2c4d028 · 2025-03-19T11:12:52.000-07:00
* fix: pass dtypes to read_json with pyarrow engine

* fix: code checks

* fix: commit checks

* fix: commit checks

* fix: commit checks

* fic: formatting

* fix: commit checks

* feat: change type conversion

* Update _json.py

* Update _json.py

* Update _json.py

* Update _json.py

* Update pandas/tests/io/json/test_pandas.py

Co-authored-by: William Ayd &lt;william.ayd@icloud.com&gt;

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

* Update _json.py

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

* Update test_pandas.py

---------

Co-authored-by: William Ayd &lt;william.ayd@icloud.com&gt;
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -741,6 +741,7 @@ I/O
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
+- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -32,10 +32,12 @@
 from pandas.core.dtypes.common import (
     ensure_str,
     is_string_dtype,
+    pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import PeriodDtype
 
 from pandas import (
+    ArrowDtype,
     DataFrame,
     Index,
     MultiIndex,
@@ -942,29 +944,61 @@ def read(self) -> DataFrame | Series:
         obj: DataFrame | Series
         with self:
             if self.engine == "pyarrow":
-                pyarrow_json = import_optional_dependency("pyarrow.json")
-                pa_table = pyarrow_json.read_json(self.data)
-                return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
+                obj = self._read_pyarrow()
             elif self.engine == "ujson":
-                if self.lines:
-                    if self.chunksize:
-                        obj = concat(self)
-                    elif self.nrows:
-                        lines = list(islice(self.data, self.nrows))
-                        lines_json = self._combine_lines(lines)
-                        obj = self._get_object_parser(lines_json)
-                    else:
-                        data = ensure_str(self.data)
-                        data_lines = data.split("\n")
-                        obj = self._get_object_parser(self._combine_lines(data_lines))
-                else:
-                    obj = self._get_object_parser(self.data)
-                if self.dtype_backend is not lib.no_default:
-                    return obj.convert_dtypes(
-                        infer_objects=False, dtype_backend=self.dtype_backend
-                    )
-                else:
-                    return obj
+                obj = self._read_ujson()
+
+        return obj
+
+    def _read_pyarrow(self) -> DataFrame:
+        """
+        Read JSON using the pyarrow engine.
+        """
+        pyarrow_json = import_optional_dependency("pyarrow.json")
+        options = None
+
+        if isinstance(self.dtype, dict):
+            pa = import_optional_dependency("pyarrow")
+            fields = []
+            for field, dtype in self.dtype.items():
+                pd_dtype = pandas_dtype(dtype)
+                if isinstance(pd_dtype, ArrowDtype):
+                    fields.append((field, pd_dtype.pyarrow_dtype))
+
+            schema = pa.schema(fields)
+            options = pyarrow_json.ParseOptions(
+                explicit_schema=schema, unexpected_field_behavior="infer"
+            )
+
+        pa_table = pyarrow_json.read_json(self.data, parse_options=options)
+        df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
+
+        return df
+
+    def _read_ujson(self) -> DataFrame | Series:
+        """
+        Read JSON using the ujson engine.
+        """
+        obj: DataFrame | Series
+        if self.lines:
+            if self.chunksize:
+                obj = concat(self)
+            elif self.nrows:
+                lines = list(islice(self.data, self.nrows))
+                lines_json = self._combine_lines(lines)
+                obj = self._get_object_parser(lines_json)
+            else:
+                data = ensure_str(self.data)
+                data_lines = data.split("\n")
+                obj = self._get_object_parser(self._combine_lines(data_lines))
+        else:
+            obj = self._get_object_parser(self.data)
+        if self.dtype_backend is not lib.no_default:
+            return obj.convert_dtypes(
+                infer_objects=False, dtype_backend=self.dtype_backend
+            )
+        else:
+            return obj
 
     def _get_object_parser(self, json: str) -> DataFrame | Series:
         """
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -1,6 +1,9 @@
 import datetime
 from datetime import timedelta
-from io import StringIO
+from io import (
+    BytesIO,
+    StringIO,
+)
 import json
 import os
 import sys
@@ -2184,6 +2187,30 @@ def test_read_json_dtype_backend(
         # string_storage setting -> ignore that for checking the result
         tm.assert_frame_equal(result, expected, check_column_type=False)
 
+    @td.skip_if_no("pyarrow")
+    @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+    def test_read_json_pyarrow_with_dtype(self):
+        dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"}
+        json = b'{"a": 1, "b": 2}\n'
+
+        df = read_json(
+            BytesIO(json),
+            dtype=dtype,
+            lines=True,
+            engine="pyarrow",
+            dtype_backend="pyarrow",
+        )
+
+        result = df.dtypes
+        expected = Series(
+            data=[
+                pd.ArrowDtype.construct_from_string("int32[pyarrow]"),
+                pd.ArrowDtype.construct_from_string("int64[pyarrow]"),
+            ],
+            index=["a", "b"],
+        )
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("orient", ["split", "records", "index"])
     def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
         # GH#50750