@@ -249,6 +249,42 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE,
249249 return set ()
250250
251251
async def load_docs(urls, browser_semaphore=None, **kwargs):
    """Fetch the input URLs and return the non-empty documents

    Parameters
    ----------
    urls : iterable of str
        URLs (as strings) to retrieve. The iterable is unpacked into
        positional arguments for the loader's ``fetch_all`` call.
    browser_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore handed to the file loader so that the number of
        playwright browsers open at the same time during document
        retrieval can be capped. ``None`` means no cap is applied.
        By default, ``None``.
    kwargs
        Extra keyword arguments forwarded to the
        :class:`elm.web.file_loader.AsyncFileLoader` initializer.

    Returns
    -------
    list
        The fetched document instances whose ``empty`` attribute is
        falsy. Documents that come back empty (e.g. the URL could not
        be fetched) are left out, so the output may be shorter than
        the input.
    """
    logger.trace("Downloading docs for the following URL's:\n %r", urls)
    logger.trace("kwargs for AsyncFileLoader:\n %s", pprint.pformat(kwargs))

    loader = AsyncFileLoader(browser_semaphore=browser_semaphore, **kwargs)
    fetched = await loader.fetch_all(*urls)

    # Page counts are keyed by each document's "source" attribute and
    # are only used for the debug message below.
    pages_per_source = {}
    for document in fetched:
        source = document.attrs.get("source", "Unknown")
        pages_per_source[source] = len(document.pages)
    logger.debug("Loaded the following number of pages for docs:\n %s",
                 pprint.pformat(pages_per_source))

    # Only documents that actually contain content are returned
    non_empty = []
    for document in fetched:
        if document.empty:
            continue
        non_empty.append(document)
    return non_empty
286+
287+
252288async def _single_se_search (se_name , queries , num_urls , ignore_url_parts ,
253289 browser_sem , task_name , kwargs ):
254290 """Search for links using a single search engine"""
@@ -342,39 +378,3 @@ def _as_set(user_input):
342378 if isinstance (user_input , str ):
343379 user_input = {user_input }
344380 return set (user_input or [])
345-
346-
async def load_docs(urls, browser_semaphore=None, **kwargs):
    """Fetch the input URLs and return the non-empty documents

    Parameters
    ----------
    urls : iterable of str
        URLs (as strings) to retrieve. The iterable is unpacked into
        positional arguments for the loader's ``fetch_all`` call.
    browser_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore handed to the file loader so that the number of
        playwright browsers open at the same time during document
        retrieval can be capped. ``None`` means no cap is applied.
        By default, ``None``.
    kwargs
        Extra keyword arguments forwarded to the
        :class:`elm.web.file_loader.AsyncFileLoader` initializer.

    Returns
    -------
    list
        The fetched document instances whose ``empty`` attribute is
        falsy. Documents that come back empty (e.g. the URL could not
        be fetched) are left out, so the output may be shorter than
        the input.
    """
    logger.trace("Downloading docs for the following URL's:\n %r", urls)
    logger.trace("kwargs for AsyncFileLoader:\n %s", pprint.pformat(kwargs))

    loader = AsyncFileLoader(browser_semaphore=browser_semaphore, **kwargs)
    fetched = await loader.fetch_all(*urls)

    # Page counts are keyed by each document's "source" attribute and
    # are only used for the debug message below.
    pages_per_source = {}
    for document in fetched:
        source = document.attrs.get("source", "Unknown")
        pages_per_source[source] = len(document.pages)
    logger.debug("Loaded the following number of pages for docs:\n %s",
                 pprint.pformat(pages_per_source))

    # Only documents that actually contain content are returned
    non_empty = []
    for document in fetched:
        if document.empty:
            continue
        non_empty.append(document)
    return non_empty
0 commit comments