@@ -1810,7 +1810,7 @@ def searchsorted(
18101810 Value(s) to insert into `self`.
18111811 side : {'left', 'right'}, optional
18121812 If 'left', the index of the first suitable location found is given.
1813- If 'right', return the last such index. If there is no suitable
1813+ If 'right', return the last such index. If there is no suitable
18141814 index, return either 0 or N (where N is the length of `self`).
18151815 sorter : 1-D array-like, optional
18161816 Optional array of integer indices that sort array a into ascending
@@ -1837,19 +1837,95 @@ def searchsorted(
18371837 "searchsorted requires array to be sorted, which is impossible "
18381838 "with NAs present."
18391839 )
1840- if isinstance (value , ExtensionArray ):
1841- value = value .astype (object )
1842- # Base class searchsorted would cast to object, which is *much* slower.
1840+
1841+ if sorter is not None :
1842+ dtype = None
1843+
1844+ if isinstance (self .dtype , ArrowDtype ):
1845+ pa_dtype = self .dtype .pyarrow_dtype
1846+
1847+ if (
1848+ pa .types .is_timestamp (pa_dtype ) or pa .types .is_duration (pa_dtype )
1849+ ) and pa_dtype .unit == "ns" :
1850+ dtype = object
1851+
1852+ return self .to_numpy (dtype = dtype ).searchsorted (
1853+ value ,
1854+ side = side ,
1855+ sorter = sorter ,
1856+ )
1857+
1858+ arr = self ._pa_array .combine_chunks ()
1859+ pa_dtype = arr .type
1860+
1861+ # Fast Arrow-native path for strings
1862+ if pa .types .is_string (pa_dtype ) or pa .types .is_large_string (pa_dtype ):
1863+ offsets_buf = arr .buffers ()[1 ]
1864+ data_buf = arr .buffers ()[2 ]
1865+
1866+ offset_dtype = np .int64 if pa .types .is_large_string (pa_dtype ) else np .int32
1867+
1868+ offsets = np .frombuffer (offsets_buf , dtype = offset_dtype )
1869+ data = memoryview (data_buf )
1870+
1871+ def get_string (i : int ) -> bytes :
1872+ start = offsets [i ]
1873+ end = offsets [i + 1 ]
1874+ return data [start :end ].tobytes ()
1875+
1876+ def binary_search (target , side_local ):
1877+ if isinstance (target , str ):
1878+ target = target .encode ()
1879+ elif not isinstance (target , bytes ):
1880+ target = str (target ).encode ()
1881+
1882+ left = 0
1883+ right = len (arr )
1884+
1885+ while left < right :
1886+ mid = (left + right ) // 2
1887+ mid_value = get_string (mid )
1888+
1889+ if side_local == "left" :
1890+ if mid_value < target :
1891+ left = mid + 1
1892+ else :
1893+ right = mid
1894+ elif mid_value <= target :
1895+ left = mid + 1
1896+ else :
1897+ right = mid
1898+
1899+ return left
1900+
1901+ # scalar input
1902+ if is_scalar (value ):
1903+ return np .intp (binary_search (value , side ))
1904+
1905+ # vector input
1906+ result = np .empty (len (value ), dtype = np .intp )
1907+
1908+ for i , val in enumerate (value ):
1909+ result [i ] = binary_search (val , side )
1910+
1911+ return result
1912+
1913+ # Fallback for non-string dtypes
18431914 dtype = None
1915+
18441916 if isinstance (self .dtype , ArrowDtype ):
18451917 pa_dtype = self .dtype .pyarrow_dtype
1918+
18461919 if (
18471920 pa .types .is_timestamp (pa_dtype ) or pa .types .is_duration (pa_dtype )
18481921 ) and pa_dtype .unit == "ns" :
1849- # np.array[datetime/timedelta].searchsorted(datetime/timedelta)
1850- # erroneously fails when numpy type resolution is nanoseconds
18511922 dtype = object
1852- return self .to_numpy (dtype = dtype ).searchsorted (value , side = side , sorter = sorter )
1923+
1924+ return self .to_numpy (dtype = dtype ).searchsorted (
1925+ value ,
1926+ side = side ,
1927+ sorter = sorter ,
1928+ )
18531929
18541930 def take (
18551931 self ,
0 commit comments