Skip to content

Commit 920b25b

Browse files
committed
update the new functions and ready for the new version of v2.0.6
1 parent 660daec commit 920b25b

File tree

6 files changed

+185
-109
lines changed

6 files changed

+185
-109
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ $ pip install preparser
3030
3131
## parameters
3232

33-
here below are some of the parameters you can use for initai the Object `PreParser` from the package `preparser`:
33+
here below are some of the parameters you can use to initialize the Object `PreParser` from the package `preparser`:
3434

3535

3636
| Parameters | Type | Description |
@@ -44,7 +44,7 @@ here below are some of the parameters you can use for initai the Object `PrePars
4444
| stop_when_task_failed | bool | whether to stop when a request to a URL fails, default is `True` |
4545
| threading_numbers | int | The maximum number of threads in the threading pool. Default is `3`. |
4646
| checked_same_site | bool | whether to add more headers info to pretend the request comes from the same site when parsing data, default is `True`, to resolve the `CORS` Block. |
47-
47+
| html_dynamic_scope | list or None | point to and get the specified scope dom of the whole page html, default is None, which stands for the whole page.<br />if this value is set, the parameter should be a list(2) Object. <br/> 1. the first value is a tag <a href="https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector"> selector</a>. <br /> for example, 'div#main' means a div tag with 'id=main', 'div.test' will get the first matched div tag with 'class = test'. but don't make the selector too complex or match multiple parent doms, otherwise you can't get their inner_html() correctly or it may time out; finally you can get the BeautifulSoup object of the inner_html from this selector-selected tag in the `request_call_back_func`. <br /> 2. the second value should be one of the values below: <br />`attached`: wait for element to be present in DOM. <br />`detached`: wait for element to not be present in DOM. <br />`hidden`: wait for element to be either detached from DOM, or have an empty bounding box or 'visibility:hidden'. This is opposite to the 'visible' option. <br /> `visible`: wait for element to have a non-empty bounding box and no 'visibility:hidden'. Note that an element without any content or with 'display:none' has an empty bounding box and is not considered visible.
4848

4949
## example
5050

@@ -81,7 +81,7 @@ if __name__ == "__main__":
8181
parser = PreParser(
8282
url_list=url_list,
8383
request_call_back_func=handle_preparser_result,
84-
parser_mode='api', # this mode depands on you set, you can use the "api" or "html"
84+
parser_mode='api', # this mode depends on what you set; you can use "api", "html", or 'html_dynamic'
8585
start_threading=True,
8686
threading_mode='single',
8787
cached_data=True,
@@ -114,6 +114,8 @@ Get help ➡️ [Github issue](https://github.com/BertramYe/preparser/issues)
114114

115115
# Update logs
116116

117+
* `version 2.0.6 `: add the `html_dynamic_scope` parameter to let users specify the dynamic parse scope, which can help speed up the preparser when the `parser_mode` is `html_dynamic`, and re-sort the additional tools into the `ToolsHelper` package.
118+
117119
* `version 2.0.5 `: remove the dynamic mode browser core install from setup into package call.
118120

119121
* `version 2.0.4 `: test the installing process command.

preparser/DynamicHelper.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from typing import Literal,Optional
66
from playwright.sync_api import sync_playwright
77

8+
9+
Moniter_Notes = list[str,Literal['attached', 'detached', 'hidden', 'visible']] | None
10+
811
class Dynamicer():
912
"""
1013
install the Browser Core
@@ -38,7 +41,7 @@ def _check_dynamic_async_env(self) -> int:
3841
self._async_index = installed_browser_index
3942
return installed_browser_index
4043

41-
def _get_dynamic_html(self,url:str) -> str | None:
44+
def _get_dynamic_html(self,url:str,moniter_scope:Moniter_Notes = None) -> str | None:
4245
try:
4346
if 0 <= self._async_index < 3:
4447
with sync_playwright() as p:
@@ -50,13 +53,27 @@ def _get_dynamic_html(self,url:str) -> str | None:
5053
browser = p.webkit.launch(headless=True)
5154
page = browser.new_page()
5255
page.goto(url)
53-
html = page.content()
56+
html = None
57+
if moniter_scope is not None:
58+
target_element = page.wait_for_selector(moniter_scope[0],state=moniter_scope[1])
59+
if target_element:
60+
target_element.scroll_into_view_if_needed()
61+
html = target_element.as_element().inner_html()
62+
# print(target_element.eval_on_selector_all('.row', 'elements => elements.map(el => el.outerHTML)'))
63+
# stop all of rest pages resouce loading to faster the loading speed
64+
page.route("**/*",lambda route,request:route.abort())
65+
# html = page.content()
66+
# stop specified type of resource loading, here below is the ['image', 'script'] two kind of the resouce type
67+
# page.route("**/*", lambda route, request: route.abort() if request.resource_type() in ['image', 'script'] else route.continue_())
68+
else:
69+
html = page.content()
70+
page.close()
5471
browser.close()
5572
return html
5673
else:
5774
return None
5875
except Exception as err:
59-
print(f'error when doing dynamic html parse , error: {err} !')
76+
print(f'error when parsing dynamic html , error: {err} !')
6077
return None
6178

6279
def init_install_browser(self):

preparser/PreParseHelper.py

Lines changed: 35 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@
33
from bs4 import BeautifulSoup
44
from typing import Callable,Literal,Any
55
from urllib.parse import urlparse
6-
from playwright.sync_api import sync_playwright,Browser
7-
from os import path
86
from .TaskHelper import Tasker
9-
from .DynamicHelper import Dynamicer
7+
from .DynamicHelper import Dynamicer,Moniter_Notes
108
# typing
119
Json_Data = dict[str, Any]
1210

@@ -16,30 +14,42 @@ class PreParser():
1614
A slight PreParser oject to handle the parsing task with threading pools or other methods from webpage urls or api urls.
1715
1816
Parameters:
19-
url_list (list): The list of URLs to parse from. Default is an empty list.
20-
request_call_back_func (Callable[[str,BeautifulSoup | Dict[str, Any]], Any] | None): A callback function according to the parser_mode to handle the `BeautifulSoup` object or request `json` Object. and if you want to show your business process failed, you can return `None`, otherwise please return a `not None` Object.
21-
parser_mode (Literal['html','api']): the pre-parsing datas mode,default is html,
17+
url_list(list):The list of URLs to parse from. Default is an empty list.
18+
request_call_back_func (Callable[[str,BeautifulSoup | Dict[str, Any]], Any] | None):A callback function according to the parser_mode to handle the `BeautifulSoup` object or request `json` Object. and if you want to show your business process failed, you can return `None`, otherwise please return a `not None` Object.
19+
parser_mode(Literal['html','api']): the pre-parsing datas mode,default is html,
2220
`html`: parse the content from static html, and return an `BeautifulSoup` Object.
2321
`api`: parse the datas from an api, and return the `json` Object.
2422
`html_dynamic`: parse from the whole webpage html content and return an `BeautifulSoup` Object, even the content that generated by the dynamic js code.
25-
cached_data (bool): weather cache the parsed datas, defalt is False.
26-
start_threading (bool): Whether to use threading pool for parsing the data. Default is False.
27-
threading_mode (Literal['map','single']): to run the task mode,default is `single`
23+
cached_data(bool): weather cache the parsed datas, defalt is False.
24+
start_threading(bool): Whether to use threading pool for parsing the data. Default is False.
25+
threading_mode(Literal['map','single']): to run the task mode,default is `single`.
2826
`map`: use the `map` func of the theading pool to distribute tasks.
2927
`single`: use the `submit` func to distribute the task one by one into the theading pool.
30-
stop_when_task_failed (bool) : wheather need stop when you failed to get request from a Url,default is True
31-
threading_numbers (int): The maximum number of threads in the threading pool. Default is 3.
32-
checked_same_site (bool): wheather need add more headers info to pretend requesting in a same site to parse datas, default is True,to resolve the CORS Block.
28+
stop_when_task_failed(bool): wheather need stop when you failed to get request from a Url,default is True.
29+
threading_numbers(int): The maximum number of threads in the threading pool. Default is 3.
30+
checked_same_site(bool): wheather need add more headers info to pretend requesting in a same site to parse datas, default is True,to resolve the CORS Block.
31+
html_dynamic_scope(list[str,Literal['attached', 'detached', 'hidden', 'visible']] | None): point to and get the specified scope dom of the whole page html, default is None, which stands for the whole page dom.
32+
else if this value was set, the parameter should be a list(2) Object.
33+
1. the first value is a tag <a href="https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector"> selecter</a>.
34+
for example, 'div#main' mean a div tag with 'id=main', 'div.test' will get the the first matched div tag with 'class = test'.
35+
but don't make the selecter too complex or matched the mutiple parent dom, otherwise you can't get their inner_html() correctly or time out.
36+
and finally you can get the BeautifulSoup object of the inner_html from this selecter selected tag in the `request_call_back_func`.
37+
2. the secound value should be one of the values below:
38+
`attached`: wait for element to be present in DOM
39+
`detached`: wait for element to not be present in DOM.
40+
`hidden`: wait for element to be either detached from DOM, or have an empty bounding box or `visibility:hidden`. This is opposite to the `visible` option.
41+
`visible`: wait for element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible.
3342
Attributes:
34-
url_list (list): The list of URLs to parse from.
35-
request_call_back_func (Callable[[str,BeautifulSoup | Dict[str, Any]], bool] | None): The callback function to process the BeautifulSoup Or Json object.
36-
parser_mode (Literal['html','api']): the preparse datas mode.
37-
cached_data (bool): weather to cache the parse datas.
38-
start_threading (bool): Whether to use threading pool.
39-
threading_mode (Literal['map','single']): to run the task mode.
40-
stop_when_task_failed (bool) : wheather need stop when you failed to get request from a Url.
41-
threading_numbers (int): The maximum number of threads.
42-
checked_same_site (bool): wheather need add more headers info to pretend requesting in a same site to parse datas, to resolve the CORS Block.
43+
url_list(list):The list of URLs to parse from.
44+
request_call_back_func(Callable[[str,BeautifulSoup | Dict[str, Any]], bool] | None): The callback function to process the BeautifulSoup Or Json object.
45+
parser_mode(Literal['html','api']): the preparse datas mode.
46+
cached_data(bool): weather to cache the parse datas.
47+
start_threading(bool): Whether to use threading pool.
48+
threading_mode(Literal['map','single']): to run the task mode.
49+
stop_when_task_failed(bool): wheather need stop when you failed to get request from a Url.
50+
threading_numbers(int): The maximum number of threads.
51+
checked_same_site(bool): wheather need add more headers info to pretend requesting in a same site to parse datas, to resolve the CORS Block.
52+
html_dynamic_scope(list[str,Literal['attached', 'detached', 'hidden', 'visible']] | None): to get and load the specified scope of html node resources.
4353
"""
4454
def __init__(self,
4555
url_list: list[str] = [],
@@ -50,7 +60,8 @@ def __init__(self,
5060
threading_mode:Literal['map','single'] = 'single',
5161
stop_when_task_failed:bool = True,
5262
threading_numbers: int = 3,
53-
checked_same_site:bool = True
63+
checked_same_site:bool = True,
64+
html_dynamic_scope:Moniter_Notes= None, # await loaded contions
5465
) -> None:
5566
self.to_parse_urls = url_list
5667
self.start_threading = start_threading
@@ -66,6 +77,7 @@ def __init__(self,
6677
self.dynamicer= Dynamicer()
6778
self._stop_running = False
6879
self._async_bundle_index = self._get_aync_bundle_index()
80+
self._html_dynamic_scope = html_dynamic_scope
6981

7082

7183
def _get_aync_bundle_index(self) -> int:
@@ -81,7 +93,7 @@ def _get_aync_bundle_index(self) -> int:
8193

8294
def _get_synamic_soup(self,url:str) -> BeautifulSoup | None:
8395
if self._async_bundle_index >= 0:
84-
html = self.dynamicer._get_dynamic_html(url)
96+
html = self.dynamicer._get_dynamic_html(url,self._html_dynamic_scope)
8597
if html:
8698
return BeautifulSoup(html, 'html.parser')
8799
return None
@@ -183,84 +195,7 @@ def _create_request_headers(self,url:str) -> dict[str,str]:
183195
return headers
184196

185197

186-
def find_all_betweem_same_level_nodes(start_node:BeautifulSoup | None =None,
187-
end_node:BeautifulSoup | None =None,
188-
include_start_node:bool=False,
189-
include_end_node:bool=False,
190-
parent_node:BeautifulSoup | None = None
191-
) -> BeautifulSoup | None:
192-
"""
193-
this function is help finding out the website elements nodes between specified two same level elements notes and finally return a new BeautifulSoup Object
194-
195-
Parameters:
196-
start_node (BeautifulSoup | None): The start elements nodes, defaut is None, which means from the target first one element to start get the element node
197-
end_node (BeautifulSoup | None): The end elements nodes, defaut is None, which means from the last element to start get the element node
198-
include_start_node (bool): when get the element nodes, weather include the start node, default is False.
199-
include_end_node (bool): when get the element nodes, weather include the end node, default is False.
200-
parent_node (BeautifulSoup | None): the parent element node which contained the start_node and end_node,
201-
if you set it , we just find the node in current nodes' children element,
202-
also default it can be None,which will be the parent node of the start_node or end_node.
203-
"""
204-
205-
206-
if (not start_node) and (not end_node):
207-
print("error: start_node and end_node are both None !!!")
208-
return None
209-
valid_numbers = 0
210-
parent = parent_node if parent_node else (start_node.parent if start_node else end_node.parent)
211-
parent_chidren_list = list(parent.children)
212-
if start_node:
213-
start_index = parent.index(start_node) + 1
214-
else:
215-
start_index = 1
216-
if end_node:
217-
end_index = parent.index(end_node)
218-
else:
219-
end_index = parent_chidren_list.__len__()
220-
if include_start_node:
221-
valid_numbers += 1
222-
start_index -= 1
223-
if include_end_node:
224-
valid_numbers += 1
225-
end_index += 1
226-
between_nodes_list = parent_chidren_list[start_index:end_index]
227-
if len(between_nodes_list) == valid_numbers:
228-
print('no sibling nodes between the start_node and end_node !!!')
229-
return None
230-
else:
231-
html_str = ''.join(str(node) for node in between_nodes_list)
232-
return BeautifulSoup(html_str, 'html.parser')
233-
234198

235-
def get_per_table_data(table_soup:BeautifulSoup) -> list[list[str]]:
236-
"""
237-
get the table datas from the standard element of table, which has 1 row head at most.
238-
239-
"""
240-
final_tables_row = []
241-
# thead
242-
thead = table_soup.find('thead')
243-
if thead:
244-
thead_th:list[BeautifulSoup] = thead.find('tr').find_all('th') # default the table only one title tr of thead
245-
th_datas = []
246-
for th in thead_th:
247-
# th_tx = repr(th.get_text(strip=True)).strip("'")
248-
th_tx = th.get_text(strip=True) # strip=True: get ride of the emply space from start and end
249-
th_datas.append(th_tx)
250-
final_tables_row.append(th_datas)
251-
# tbody
252-
tbody = table_soup.find('tbody')
253-
if tbody:
254-
tbody_tr:list[BeautifulSoup] = tbody.find_all('tr')
255-
for tr in tbody_tr:
256-
tr_datas = []
257-
tds:list[BeautifulSoup] = tr.find_all('td')
258-
for td in tds:
259-
# td_txt = repr(td.get_text(strip=True)).strip("'")
260-
td_txt = td.get_text(strip=True) # strip=True: get ride of the emply space from start and end
261-
tr_datas.append(td_txt)
262-
final_tables_row.append(tr_datas)
263-
return final_tables_row
264199

265200

266201

0 commit comments

Comments
 (0)