Skip to content

Commit af6296b

Browse files
authored
Merge pull request #55 from NREL/pp/minor_updates
Minor misc. updates
2 parents 67a0259 + cf3f022 commit af6296b

13 files changed

Lines changed: 190 additions & 66 deletions

File tree

.github/workflows/pytest_ords.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,14 @@ jobs:
3939
python -m pip install --upgrade pip
4040
python -m pip install pdftotext
4141
python -m pip install pytest
42+
python -m pip install pytest-asyncio
4243
python -m pip install pytest-mock
4344
python -m pip install pytest-cov
4445
python -m pip install psycopg2-binary
4546
python -m pip install boto3
4647
python -m pip install flaky
4748
python -m pip install .
48-
rebrowser_playwright install
49+
rebrowser_playwright install --with-deps
4950
- name: Run pytest and Generate coverage report
5051
shell: bash -l {0}
5152
run: |

elm/utilities/parse.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,12 @@ def _load_pdf_possibly_multi_col(pdf_bytes):
397397
return pages
398398

399399

400-
def read_pdf_ocr(pdf_bytes, verbose=True): # pragma: no cover
400+
def read_pdf_ocr( # pragma: no cover
401+
pdf_bytes,
402+
image_to_string_kwargs=None,
403+
convert_from_bytes_kwargs=None,
404+
verbose=True
405+
):
401406
"""Read PDF contents from bytes using Optical Character recognition (OCR).
402407
403408
This method attempts to read the PDF document using OCR. This is one
@@ -417,6 +422,14 @@ def read_pdf_ocr(pdf_bytes, verbose=True): # pragma: no cover
417422
----------
418423
pdf_bytes : bytes
419424
Bytes corresponding to a PDF file.
425+
image_to_string_kwargs : dictionary, optional
426+
Optional dictionary of keyword-value pairs to pass as arguments
427+
to the :func:`pytesseract.image_to_string` function.
428+
By default, ``None``.
429+
convert_from_bytes_kwargs : dictionary, optional
430+
Optional dictionary of keyword-value pairs to pass as arguments
431+
to the :func:`pdf2image.convert_from_bytes` function.
432+
By default, ``None``.
420433
verbose : bool, optional
421434
Option to log errors during parsing. By default, ``True``.
422435
@@ -427,7 +440,11 @@ def read_pdf_ocr(pdf_bytes, verbose=True): # pragma: no cover
427440
may be empty if there was an error reading the PDF file.
428441
"""
429442
try:
430-
pages = _load_pdf_with_pytesseract(pdf_bytes)
443+
pages = _load_pdf_with_pytesseract(
444+
pdf_bytes,
445+
image_to_string_kwargs=image_to_string_kwargs,
446+
convert_from_bytes_kwargs=convert_from_bytes_kwargs
447+
)
431448
except Exception as e:
432449
if verbose:
433450
logger.error("Failed to decode PDF content!")
@@ -437,7 +454,9 @@ def read_pdf_ocr(pdf_bytes, verbose=True): # pragma: no cover
437454
return pages
438455

439456

440-
def _load_pdf_with_pytesseract(pdf_bytes): # pragma: no cover
457+
def _load_pdf_with_pytesseract( # pragma: no cover
458+
pdf_bytes, image_to_string_kwargs=None, convert_from_bytes_kwargs=None
459+
):
441460
"""Load PDF bytes using Optical Character recognition (OCR)"""
442461

443462
try:
@@ -469,7 +488,12 @@ def _load_pdf_with_pytesseract(pdf_bytes): # pragma: no cover
469488
pytesseract.pytesseract.tesseract_cmd,
470489
)
471490

491+
its = {"timeout": 60 * 5}
492+
its.update(image_to_string_kwargs or {})
493+
494+
cfb = {"grayscale": True}
495+
cfb.update(convert_from_bytes_kwargs or {})
472496
return [
473-
str(pytesseract.image_to_string(page_data).encode("utf-8"))
474-
for page_data in convert_from_bytes(bytes(pdf_bytes))
497+
str(pytesseract.image_to_string(page_data, **its).encode("utf-8"))
498+
for page_data in convert_from_bytes(bytes(pdf_bytes), **cfb)
475499
]

elm/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
ELM version number
33
"""
44

5-
__version__ = "0.0.13"
5+
__version__ = "0.0.14"

elm/web/file_loader.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class AsyncFileLoader:
4646
.. end desc
4747
"""
4848

49-
PAGE_LOAD_TIMEOUT = 90_000
49+
PAGE_LOAD_TIMEOUT = 60_000
5050
"""Default page load timeout value in milliseconds"""
5151

5252
def __init__(
@@ -192,6 +192,7 @@ async def _fetch_doc(self, url):
192192
logger.trace("Fetching content from %r", url)
193193
url_bytes = await self._fetch_content_with_retry(url, session)
194194
except ELMRuntimeError:
195+
logger.exception("Could not fetch content from %r", url)
195196
return PDFDocument(pages=[]), None
196197

197198
logger.trace("Got content from %r", url)

elm/web/html_pw.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
TimeoutError as PlaywrightTimeoutError
1313
)
1414

15-
from elm.web.utilities import pw_page
15+
from elm.web.utilities import pw_page, PWKwargs
1616

1717

1818
logger = logging.getLogger(__name__)
@@ -62,13 +62,16 @@ async def _load_html( # pragma: no cover
6262
if browser_semaphore is None:
6363
browser_semaphore = AsyncExitStack()
6464

65+
launch_kwargs = PWKwargs.launch_kwargs()
66+
launch_kwargs.update(pw_launch_kwargs)
67+
6568
logger.trace("Loading HTML using playwright")
6669
async with async_playwright() as p, browser_semaphore:
6770
logger.trace("launching chromium; browser_semaphore=%r",
6871
browser_semaphore)
69-
browser = await p.chromium.launch(**pw_launch_kwargs)
72+
browser = await p.chromium.launch(**launch_kwargs)
7073
page_kwargs = {"browser": browser, "intercept_routes": True,
71-
"ignore_https_errors": True} # no sensitive inputs
74+
"timeout": timeout, "ignore_https_errors": True}
7275
async with pw_page(**page_kwargs) as page:
7376
logger.trace("Navigating to: %r", url)
7477
await page.goto(url)

elm/web/search/base.py

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
)
1212
from playwright_stealth import StealthConfig
1313

14-
from elm.web.utilities import clean_search_query, pw_page
14+
from elm.web.utilities import PWKwargs, clean_search_query, pw_page
1515

1616

1717
logger = logging.getLogger(__name__)
@@ -34,17 +34,21 @@ async def results(self, *queries, num_results=10):
3434
*queries : str
3535
One or more queries to search for.
3636
num_results : int, optional
37-
Number of top results to retrieve for each query. Note that
38-
this value can never exceed the number of results per page
39-
(typically 10). If you pass in a larger value, it will be
40-
reduced to the number of results per page.
41-
By default, ``10``.
37+
Maximum number of top results to retrieve for each query.
38+
Note that this value can never exceed the number of results
39+
per page (typically 10). If you pass in a larger value, it
40+
will be reduced to the number of results per page. There is
41+
also no guarantee that the search query will return this
42+
many results - the actual number of results returned is
43+
determined by the number of results on a page (excluding
44+
ads). You can, however, use this input to limit the number
45+
of results returned. By default, ``10``.
4246
4347
Returns
4448
-------
4549
list
4650
List with the same length as the input queries, where each
47-
entry is another list containing the top `num_results`
51+
entry is another list containing no more than `num_results`
4852
links.
4953
"""
5054
queries = map(clean_search_query, queries)
@@ -71,8 +75,8 @@ async def _skip_exc_search(self, query, num_results=10):
7175
"""Perform search while ignoring errors"""
7276
try:
7377
return await self._search(query, num_results=num_results)
74-
except self._EXCEPTION_TO_CATCH as e:
75-
logger.exception(e)
78+
except self._EXCEPTION_TO_CATCH:
79+
logger.exception("Could not complete search for query=%r", query)
7680
return []
7781

7882
@abstractmethod
@@ -87,7 +91,7 @@ class PlaywrightSearchEngineLinkSearch(SearchEngineLinkSearch):
8791
MAX_RESULTS_CONSIDERED_PER_PAGE = 10
8892
"""Number of results considered per search engine page"""
8993

90-
PAGE_LOAD_TIMEOUT = 90_000
94+
PAGE_LOAD_TIMEOUT = 60_000
9195
"""Default page load timeout value in milliseconds"""
9296

9397
_SC = StealthConfig(navigator_user_agent=False)
@@ -104,7 +108,8 @@ def __init__(self, **launch_kwargs):
104108
``headless=False, slow_mo=50`` for a visualization of the
105109
search.
106110
"""
107-
self.launch_kwargs = launch_kwargs
111+
self.launch_kwargs = PWKwargs.launch_kwargs()
112+
self.launch_kwargs.update(launch_kwargs)
108113
self._browser = None
109114

110115
async def _load_browser(self, pw_instance):
@@ -123,15 +128,16 @@ async def _search(self, query, num_results=10):
123128
num_results = min(num_results, self.MAX_RESULTS_CONSIDERED_PER_PAGE)
124129

125130
page_kwargs = {"browser": self._browser, "stealth_config": self._SC,
126-
"ignore_https_errors": True} # no sensitive inputs
131+
"ignore_https_errors": True, # no sensitive inputs
132+
"timeout": self.PAGE_LOAD_TIMEOUT}
127133
async with pw_page(**page_kwargs) as page:
128134
await _navigate_to_search_engine(page, se_url=self._SE_URL,
129135
timeout=self.PAGE_LOAD_TIMEOUT)
130136
logger.trace("Performing %s search for query: %r", self._SE_NAME,
131137
query)
132138
await self._perform_search(page, query)
133139
logger.trace("Extracting links for query: %r", query)
134-
return await self._extract_links(page, num_results)
140+
return await self._extract_links(page, num_results, query)
135141

136142
async def _get_links(self, queries, num_results):
137143
"""Get links for multiple queries"""
@@ -152,12 +158,29 @@ async def _get_links(self, queries, num_results):
152158
await self._close_browser()
153159
return results
154160

155-
async def _extract_links(self, page, num_results):
161+
async def _extract_links(self, page, num_results, query):
156162
"""Extract links for top `num_results` on page"""
157-
links = await asyncio.to_thread(page.locator, self._SE_SR_TAG)
158-
159-
return [await links.nth(i).get_attribute("href")
160-
for i in range(num_results)]
163+
await page.wait_for_load_state("networkidle",
164+
timeout=self.PAGE_LOAD_TIMEOUT)
165+
await page.wait_for_selector(self._SE_SR_TAG)
166+
locator = page.locator(self._SE_SR_TAG)
167+
count = await locator.count()
168+
links = []
169+
170+
for i in range(count):
171+
element = locator.nth(i)
172+
try:
173+
link = await element.get_attribute("href")
174+
if link is not None:
175+
links.append(link)
176+
except Exception:
177+
logger.exception("Skipped extracting link %d for query %r",
178+
i, query)
179+
180+
if len(links) >= num_results:
181+
break
182+
183+
return links
161184

162185
@property
163186
@abstractmethod

elm/web/search/google.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
"""ELM Web Scraping - Google search."""
33
import os
44
import json
5-
import asyncio
65
import logging
76
import requests
87

@@ -115,11 +114,30 @@ async def _perform_search(self, page, search_query):
115114
logger.trace("Hitting enter for query: %r", search_query)
116115
await page.keyboard.press('Enter')
117116

118-
async def _extract_links(self, page, num_results):
117+
async def _extract_links(self, page, num_results, query):
119118
"""Extract links for top `num_results` on page"""
120-
links = await asyncio.to_thread(page.locator, self._SE_SR_TAG)
121-
return [await links.nth(i * 2).get_attribute("href")
122-
for i in range(num_results)]
119+
await page.wait_for_load_state("networkidle",
120+
timeout=self.PAGE_LOAD_TIMEOUT)
121+
await page.wait_for_selector(self._SE_SR_TAG)
122+
locator = page.locator(self._SE_SR_TAG)
123+
124+
count = await locator.count() // 2
125+
links = []
126+
127+
for i in range(count):
128+
element = locator.nth(i * 2)
129+
try:
130+
link = await element.get_attribute("href")
131+
if link is not None:
132+
links.append(link)
133+
except Exception:
134+
logger.exception("Skipped extracting link %d for query %r",
135+
i, query)
136+
137+
if len(links) >= num_results:
138+
break
139+
140+
return links
123141

124142

125143
class APIGoogleCSESearch(APISearchEngineLinkSearch):

0 commit comments

Comments
 (0)