Skip to content

Commit 0ad534c

Browse files
Solved the TC Issue raised
1 parent 04172af commit 0ad534c

1 file changed

Lines changed: 83 additions & 7 deletions

File tree

pandas/core/arrays/arrow/array.py

Lines changed: 83 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1810,7 +1810,7 @@ def searchsorted(
18101810
Value(s) to insert into `self`.
18111811
side : {'left', 'right'}, optional
18121812
If 'left', the index of the first suitable location found is given.
1813-
If 'right', return the last such index. If there is no suitable
1813+
If 'right', return the last such index. If there is no suitable
18141814
index, return either 0 or N (where N is the length of `self`).
18151815
sorter : 1-D array-like, optional
18161816
Optional array of integer indices that sort array a into ascending
@@ -1837,19 +1837,95 @@ def searchsorted(
18371837
"searchsorted requires array to be sorted, which is impossible "
18381838
"with NAs present."
18391839
)
1840-
if isinstance(value, ExtensionArray):
1841-
value = value.astype(object)
1842-
# Base class searchsorted would cast to object, which is *much* slower.
1840+
1841+
if sorter is not None:
1842+
dtype = None
1843+
1844+
if isinstance(self.dtype, ArrowDtype):
1845+
pa_dtype = self.dtype.pyarrow_dtype
1846+
1847+
if (
1848+
pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype)
1849+
) and pa_dtype.unit == "ns":
1850+
dtype = object
1851+
1852+
return self.to_numpy(dtype=dtype).searchsorted(
1853+
value,
1854+
side=side,
1855+
sorter=sorter,
1856+
)
1857+
1858+
arr = self._pa_array.combine_chunks()
1859+
pa_dtype = arr.type
1860+
1861+
# Fast path for strings: binary-search the raw Arrow offsets/data buffers in Python
1862+
if pa.types.is_string(pa_dtype) or pa.types.is_large_string(pa_dtype):
1863+
offsets_buf = arr.buffers()[1]
1864+
data_buf = arr.buffers()[2]
1865+
1866+
offset_dtype = np.int64 if pa.types.is_large_string(pa_dtype) else np.int32
1867+
1868+
offsets = np.frombuffer(offsets_buf, dtype=offset_dtype)
1869+
data = memoryview(data_buf)
1870+
1871+
def get_string(i: int) -> bytes:
1872+
start = offsets[i]
1873+
end = offsets[i + 1]
1874+
return data[start:end].tobytes()
1875+
1876+
def binary_search(target, side_local):
1877+
if isinstance(target, str):
1878+
target = target.encode()
1879+
elif not isinstance(target, bytes):
1880+
target = str(target).encode()
1881+
1882+
left = 0
1883+
right = len(arr)
1884+
1885+
while left < right:
1886+
mid = (left + right) // 2
1887+
mid_value = get_string(mid)
1888+
1889+
if side_local == "left":
1890+
if mid_value < target:
1891+
left = mid + 1
1892+
else:
1893+
right = mid
1894+
elif mid_value <= target:
1895+
left = mid + 1
1896+
else:
1897+
right = mid
1898+
1899+
return left
1900+
1901+
# scalar input
1902+
if is_scalar(value):
1903+
return np.intp(binary_search(value, side))
1904+
1905+
# vector input
1906+
result = np.empty(len(value), dtype=np.intp)
1907+
1908+
for i, val in enumerate(value):
1909+
result[i] = binary_search(val, side)
1910+
1911+
return result
1912+
1913+
# Fallback for non-string dtypes: convert to NumPy and use ndarray.searchsorted
18431914
dtype = None
1915+
18441916
if isinstance(self.dtype, ArrowDtype):
18451917
pa_dtype = self.dtype.pyarrow_dtype
1918+
18461919
if (
18471920
pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype)
18481921
) and pa_dtype.unit == "ns":
1849-
# np.array[datetime/timedelta].searchsorted(datetime/timedelta)
1850-
# erroneously fails when numpy type resolution is nanoseconds
18511922
dtype = object
1852-
return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter)
1923+
1924+
return self.to_numpy(dtype=dtype).searchsorted(
1925+
value,
1926+
side=side,
1927+
sorter=sorter,
1928+
)
18531929

18541930
def take(
18551931
self,

0 commit comments

Comments
 (0)