NatLabRockies
diff --git a/‎.github/workflows/pytest_ords.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/pytest_ords.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎elm/utilities/parse.py‎
Lines changed: 29 additions & 5 deletions b/‎elm/utilities/parse.py‎
Lines changed: 29 additions & 5 deletions
diff --git a/‎elm/version.py‎
Lines changed: 1 addition & 1 deletion b/‎elm/version.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎elm/web/file_loader.py‎
Lines changed: 2 additions & 1 deletion b/‎elm/web/file_loader.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎elm/web/html_pw.py‎
Lines changed: 6 additions & 3 deletions b/‎elm/web/html_pw.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎elm/web/search/base.py‎
Lines changed: 41 additions & 18 deletions b/‎elm/web/search/base.py‎
Lines changed: 41 additions & 18 deletions
diff --git a/‎elm/web/search/google.py‎
Lines changed: 23 additions & 5 deletions b/‎elm/web/search/google.py‎
Lines changed: 23 additions & 5 deletions
@@ -39,13 +39,14 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install pdftotext
         python -m pip install pytest
+        python -m pip install pytest-asyncio
         python -m pip install pytest-mock
         python -m pip install pytest-cov
         python -m pip install psycopg2-binary
         python -m pip install boto3
         python -m pip install flaky
         python -m pip install .
-        rebrowser_playwright install
+        rebrowser_playwright install --with-deps
     - name: Run pytest and Generate coverage report
       shell: bash -l {0}
       run: |
 
@@ -397,7 +397,12 @@ def _load_pdf_possibly_multi_col(pdf_bytes):
     return pages
 
 
-def read_pdf_ocr(pdf_bytes, verbose=True):  # pragma: no cover
+def read_pdf_ocr(  # pragma: no cover
+    pdf_bytes,
+    image_to_string_kwargs=None,
+    convert_from_bytes_kwargs=None,
+    verbose=True
+):
     """Read PDF contents from bytes using Optical Character recognition (OCR).
 
     This method attempt to read the PDF document using OCR. This is one
@@ -417,6 +422,14 @@ def read_pdf_ocr(pdf_bytes, verbose=True):  # pragma: no cover
     ----------
     pdf_bytes : bytes
         Bytes corresponding to a PDF file.
+    image_to_string_kwargs : dictionary, optional
+        Optional dictionary of keyword-value pairs to pass as arguments
+        to the :func:`pytesseract.image_to_string` function.
+        By default, ``None``.
+    convert_from_bytes_kwargs : dictionary, optional
+        Optional dictionary of keyword-value pairs to pass as arguments
+        to the :func:`pdf2image.convert_from_bytes` function.
+        By default, ``None``.
     verbose : bool, optional
         Option to log errors during parsing. By default, ``True``.
 
@@ -427,7 +440,11 @@ def read_pdf_ocr(pdf_bytes, verbose=True):  # pragma: no cover
         may be empty if there was an error reading the PDF file.
     """
     try:
-        pages = _load_pdf_with_pytesseract(pdf_bytes)
+        pages = _load_pdf_with_pytesseract(
+            pdf_bytes,
+            image_to_string_kwargs=image_to_string_kwargs,
+            convert_from_bytes_kwargs=convert_from_bytes_kwargs
+        )
     except Exception as e:
         if verbose:
             logger.error("Failed to decode PDF content!")
@@ -437,7 +454,9 @@ def read_pdf_ocr(pdf_bytes, verbose=True):  # pragma: no cover
     return pages
 
 
-def _load_pdf_with_pytesseract(pdf_bytes):  # pragma: no cover
+def _load_pdf_with_pytesseract(  # pragma: no cover
+    pdf_bytes, image_to_string_kwargs=None, convert_from_bytes_kwargs=None
+):
     """Load PDF bytes using Optical Character recognition (OCR)"""
 
     try:
@@ -469,7 +488,12 @@ def _load_pdf_with_pytesseract(pdf_bytes):  # pragma: no cover
         pytesseract.pytesseract.tesseract_cmd,
     )
 
+    its = {"timeout": 60 * 5}
+    its.update(image_to_string_kwargs or {})
+
+    cfb = {"grayscale": True}
+    cfb.update(convert_from_bytes_kwargs or {})
     return [
-        str(pytesseract.image_to_string(page_data).encode("utf-8"))
-        for page_data in convert_from_bytes(bytes(pdf_bytes))
+        str(pytesseract.image_to_string(page_data, **its).encode("utf-8"))
+        for page_data in convert_from_bytes(bytes(pdf_bytes), **cfb)
     ]
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.13"
+__version__ = "0.0.14"
@@ -46,7 +46,7 @@ class AsyncFileLoader:
     .. end desc
     """
 
-    PAGE_LOAD_TIMEOUT = 90_000
+    PAGE_LOAD_TIMEOUT = 60_000
     """Default page load timeout value in milliseconds"""
 
     def __init__(
@@ -192,6 +192,7 @@ async def _fetch_doc(self, url):
                 logger.trace("Fetching content from %r", url)
                 url_bytes = await self._fetch_content_with_retry(url, session)
             except ELMRuntimeError:
+                logger.exception("Could not fetch content from %r", url)
                 return PDFDocument(pages=[]), None
 
         logger.trace("Got content from %r", url)
 
@@ -12,7 +12,7 @@
     TimeoutError as PlaywrightTimeoutError
 )
 
-from elm.web.utilities import pw_page
+from elm.web.utilities import pw_page, PWKwargs
 
 
 logger = logging.getLogger(__name__)
@@ -62,13 +62,16 @@ async def _load_html(  # pragma: no cover
     if browser_semaphore is None:
         browser_semaphore = AsyncExitStack()
 
+    launch_kwargs = PWKwargs.launch_kwargs()
+    launch_kwargs.update(pw_launch_kwargs)
+
     logger.trace("Loading HTML using playwright")
     async with async_playwright() as p, browser_semaphore:
         logger.trace("launching chromium; browser_semaphore=%r",
                      browser_semaphore)
-        browser = await p.chromium.launch(**pw_launch_kwargs)
+        browser = await p.chromium.launch(**launch_kwargs)
         page_kwargs = {"browser": browser, "intercept_routes": True,
-                       "ignore_https_errors": True}  # no sensitive inputs
+                       "timeout": timeout, "ignore_https_errors": True}
         async with pw_page(**page_kwargs) as page:
             logger.trace("Navigating to: %r", url)
             await page.goto(url)
 
@@ -11,7 +11,7 @@
 )
 from playwright_stealth import StealthConfig
 
-from elm.web.utilities import clean_search_query, pw_page
+from elm.web.utilities import PWKwargs, clean_search_query, pw_page
 
 
 logger = logging.getLogger(__name__)
@@ -34,17 +34,21 @@ async def results(self, *queries, num_results=10):
         *queries : str
             One or more queries to search for.
         num_results : int, optional
-            Number of top results to retrieve for each query. Note that
-            this value can never exceed the number of results per page
-            (typically 10). If you pass in a larger value, it will be
-            reduced to the number of results per page.
-            By default, ``10``.
+            Maximum number of top results to retrieve for each query.
+            Note that this value can never exceed the number of results
+            per page (typically 10). If you pass in a larger value, it
+            will be reduced to the number of results per page. There is
+            also no guarantee that the search query will return this
+            many results - the actual number of results returned is
+            determined by the number of results on a page (excluding
+            ads). You can, however, use this input to limit the number
+            of results returned. By default, ``10``.
 
         Returns
         -------
         list
             List equal to the length of the input queries, where each
-            entry is another list containing the top `num_results`
+            entry is another list containing no more than `num_results`
             links.
         """
         queries = map(clean_search_query, queries)
@@ -71,8 +75,8 @@ async def _skip_exc_search(self, query, num_results=10):
         """Perform search while ignoring errors"""
         try:
             return await self._search(query, num_results=num_results)
-        except self._EXCEPTION_TO_CATCH as e:
-            logger.exception(e)
+        except self._EXCEPTION_TO_CATCH:
+            logger.exception("Could not complete search for query=%r", query)
             return []
 
     @abstractmethod
@@ -87,7 +91,7 @@ class PlaywrightSearchEngineLinkSearch(SearchEngineLinkSearch):
     MAX_RESULTS_CONSIDERED_PER_PAGE = 10
     """Number of results considered per search engine page"""
 
-    PAGE_LOAD_TIMEOUT = 90_000
+    PAGE_LOAD_TIMEOUT = 60_000
     """Default page load timeout value in milliseconds"""
 
     _SC = StealthConfig(navigator_user_agent=False)
@@ -104,7 +108,8 @@ def __init__(self, **launch_kwargs):
             ``headless=False, slow_mo=50`` for a visualization of the
             search.
         """
-        self.launch_kwargs = launch_kwargs
+        self.launch_kwargs = PWKwargs.launch_kwargs()
+        self.launch_kwargs.update(launch_kwargs)
         self._browser = None
 
     async def _load_browser(self, pw_instance):
@@ -123,15 +128,16 @@ async def _search(self, query, num_results=10):
         num_results = min(num_results, self.MAX_RESULTS_CONSIDERED_PER_PAGE)
 
         page_kwargs = {"browser": self._browser, "stealth_config": self._SC,
-                       "ignore_https_errors": True}  # no sensitive inputs
+                       "ignore_https_errors": True,  # no sensitive inputs
+                       "timeout": self.PAGE_LOAD_TIMEOUT}
         async with pw_page(**page_kwargs) as page:
             await _navigate_to_search_engine(page, se_url=self._SE_URL,
                                              timeout=self.PAGE_LOAD_TIMEOUT)
             logger.trace("Performing %s search for query: %r", self._SE_NAME,
                          query)
             await self._perform_search(page, query)
             logger.trace("Extracting links for query: %r", query)
-            return await self._extract_links(page, num_results)
+            return await self._extract_links(page, num_results, query)
 
     async def _get_links(self, queries, num_results):
         """Get links for multiple queries"""
@@ -152,12 +158,29 @@ async def _get_links(self, queries, num_results):
             await self._close_browser()
         return results
 
-    async def _extract_links(self, page, num_results):
+    async def _extract_links(self, page, num_results, query):
         """Extract links for top `num_results` on page"""
-        links = await asyncio.to_thread(page.locator, self._SE_SR_TAG)
-
-        return [await links.nth(i).get_attribute("href")
-                for i in range(num_results)]
+        await page.wait_for_load_state("networkidle",
+                                       timeout=self.PAGE_LOAD_TIMEOUT)
+        await page.wait_for_selector(self._SE_SR_TAG)
+        locator = page.locator(self._SE_SR_TAG)
+        count = await locator.count()  # matches actually found on the page
+        links = []
+
+        for i in range(count):
+            element = locator.nth(i)
+            try:
+                link = await element.get_attribute("href")
+                if link is not None:  # ignore matches with no href value
+                    links.append(link)
+            except Exception:  # best effort: log it and try the next match
+                logger.exception("Skipped extracting link %d for query %r",
+                                 i, query)
+
+            if len(links) >= num_results:  # cap at the requested maximum
+                break
+
+        return links
 
     @property
     @abstractmethod
 
@@ -2,7 +2,6 @@
 """ELM Web Scraping - Google search."""
 import os
 import json
-import asyncio
 import logging
 import requests
 
@@ -115,11 +114,30 @@ async def _perform_search(self, page, search_query):
         logger.trace("Hitting enter for query: %r", search_query)
         await page.keyboard.press('Enter')
 
-    async def _extract_links(self, page, num_results):
+    async def _extract_links(self, page, num_results, query):
         """Extract links for top `num_results` on page"""
-        links = await asyncio.to_thread(page.locator, self._SE_SR_TAG)
-        return [await links.nth(i * 2).get_attribute("href")
-                for i in range(num_results)]
+        await page.wait_for_load_state("networkidle",
+                                       timeout=self.PAGE_LOAD_TIMEOUT)
+        await page.wait_for_selector(self._SE_SR_TAG)
+        locator = page.locator(self._SE_SR_TAG)
+
+        count = (await locator.count() + 1) // 2  # round up on odd totals
+        links = []
+
+        for i in range(count):
+            element = locator.nth(i * 2)  # only even-indexed matches
+            try:
+                link = await element.get_attribute("href")
+                if link is not None:  # ignore matches with no href value
+                    links.append(link)
+            except Exception:  # best effort: log it and try the next match
+                logger.exception("Skipped extracting link %d for query %r",
+                                 i, query)
+
+            if len(links) >= num_results:  # cap at the requested maximum
+                break
+
+        return links
 
 
 class APIGoogleCSESearch(APISearchEngineLinkSearch):