1111)
1212from playwright_stealth import StealthConfig
1313
14- from elm .web .utilities import clean_search_query , pw_page
14+ from elm .web .utilities import PWKwargs , clean_search_query , pw_page
1515
1616
1717logger = logging .getLogger (__name__ )
@@ -34,17 +34,21 @@ async def results(self, *queries, num_results=10):
3434 *queries : str
3535 One or more queries to search for.
3636 num_results : int, optional
37- Number of top results to retrieve for each query. Note that
38- this value can never exceed the number of results per page
39- (typically 10). If you pass in a larger value, it will be
40- reduced to the number of results per page.
41- By default, ``10``.
37+ Maximum number of top results to retrieve for each query.
38+ Note that this value can never exceed the number of results
39+ per page (typically 10). If you pass in a larger value, it
40+ will be reduced to the number of results per page. There is
41+ also no guarantee that the search query will return this
42+ many results - the actual number of results returned is
43+ determined by the number of results on a page (excluding
44+ ads). You can, however, use this input to limit the number
45+ of results returned. By default, ``10``.
4246
4347 Returns
4448 -------
4549 list
4650 List equal to the length of the input queries, where each
47- entry is another list containing the top `num_results`
51+ entry is another list containing no more than `num_results`
4852 links.
4953 """
5054 queries = map (clean_search_query , queries )
@@ -71,8 +75,8 @@ async def _skip_exc_search(self, query, num_results=10):
7175 """Perform search while ignoring errors"""
7276 try :
7377 return await self ._search (query , num_results = num_results )
74- except self ._EXCEPTION_TO_CATCH as e :
75- logger .exception (e )
78+ except self ._EXCEPTION_TO_CATCH :
79+ logger .exception ("Could not complete search for query=%r" , query )
7680 return []
7781
7882 @abstractmethod
@@ -87,7 +91,7 @@ class PlaywrightSearchEngineLinkSearch(SearchEngineLinkSearch):
8791 MAX_RESULTS_CONSIDERED_PER_PAGE = 10
8892 """Number of results considered per search engine page"""
8993
90- PAGE_LOAD_TIMEOUT = 90_000
94+ PAGE_LOAD_TIMEOUT = 60_000
9195 """Default page load timeout value in milliseconds"""
9296
9397 _SC = StealthConfig (navigator_user_agent = False )
@@ -104,7 +108,8 @@ def __init__(self, **launch_kwargs):
104108 ``headless=False, slow_mo=50`` for a visualization of the
105109 search.
106110 """
107- self .launch_kwargs = launch_kwargs
111+ self .launch_kwargs = PWKwargs .launch_kwargs ()
112+ self .launch_kwargs .update (launch_kwargs )
108113 self ._browser = None
109114
110115 async def _load_browser (self , pw_instance ):
@@ -123,15 +128,16 @@ async def _search(self, query, num_results=10):
123128 num_results = min (num_results , self .MAX_RESULTS_CONSIDERED_PER_PAGE )
124129
125130 page_kwargs = {"browser" : self ._browser , "stealth_config" : self ._SC ,
126- "ignore_https_errors" : True } # no sensitive inputs
131+ "ignore_https_errors" : True , # no sensitive inputs
132+ "timeout" : self .PAGE_LOAD_TIMEOUT }
127133 async with pw_page (** page_kwargs ) as page :
128134 await _navigate_to_search_engine (page , se_url = self ._SE_URL ,
129135 timeout = self .PAGE_LOAD_TIMEOUT )
130136 logger .trace ("Performing %s search for query: %r" , self ._SE_NAME ,
131137 query )
132138 await self ._perform_search (page , query )
133139 logger .trace ("Extracting links for query: %r" , query )
134- return await self ._extract_links (page , num_results )
140+ return await self ._extract_links (page , num_results , query )
135141
136142 async def _get_links (self , queries , num_results ):
137143 """Get links for multiple queries"""
@@ -152,12 +158,29 @@ async def _get_links(self, queries, num_results):
152158 await self ._close_browser ()
153159 return results
154160
155- async def _extract_links (self , page , num_results ):
161+ async def _extract_links (self , page , num_results , query ):
156162 """Extract links for top `num_results` on page"""
157- links = await asyncio .to_thread (page .locator , self ._SE_SR_TAG )
158-
159- return [await links .nth (i ).get_attribute ("href" )
160- for i in range (num_results )]
163+ await page .wait_for_load_state ("networkidle" ,
164+ timeout = self .PAGE_LOAD_TIMEOUT )
165+ await page .wait_for_selector (self ._SE_SR_TAG )
166+ locator = page .locator (self ._SE_SR_TAG )
167+ count = await locator .count ()
168+ links = []
169+
170+ for i in range (count ):
171+ element = locator .nth (i )
172+ try :
173+ link = await element .get_attribute ("href" )
174+ if link is not None :
175+ links .append (link )
176+ except Exception :
177+ logger .exception ("Skipped extracting link %d for query %r" ,
178+ i , query )
179+
180+ if len (links ) >= num_results :
181+ break
182+
183+ return links
161184
162185 @property
163186 @abstractmethod
0 commit comments