Skip to content

Commit 920b25b

Browse files
committed
update the new functions and ready for the new version of v2.0.6
1 parent 660daec commit 920b25b

File tree

6 files changed

+185
-109
lines changed

6 files changed

+185
-109
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ $ pip install preparser
3030
3131
## parameters
3232

33-
here below are some of the parameters you can use for initai the Object `PreParser` from the package `preparser`:
33+
here below are some of the parameters you can use to initialize the Object `PreParser` from the package `preparser`:
3434

3535

3636
| Parameters | Type | Description |
@@ -44,7 +44,7 @@ here below are some of the parameters you can use for initai the Object `PrePars
4444
| stop_when_task_failed | bool | whether to stop when a request to a URL fails, default is `True` |
4545
| threading_numbers | int | The maximum number of threads in the threading pool. Default is `3`. |
4646
| checked_same_site | bool | whether to add more headers info to pretend the request comes from the same site when parsing data, default is `True`, to resolve the `CORS` Block. |
47-
47+
| html_dynamic_scope | list or None | point to and get the specified scope dom of the whole page html, default is None, which stands for the whole page.<br />if this value is set, the parameter should be a list(2) Object. <br/> 1. the first value is a tag <a href="https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector"> selector</a>. <br /> for example, 'div#main' means a div tag with 'id=main', 'div.test' will get the first matched div tag with 'class = test'. but don't make the selector too complex or match multiple parent doms, otherwise you can't get their inner_html() correctly or it may time out; finally you can get the BeautifulSoup object of the inner_html from this selector-selected tag in the `request_call_back_func`. <br /> 2. the second value should be one of the values below: <br />`attached`: wait for element to be present in DOM. <br />`detached`: wait for element to not be present in DOM. <br />`hidden`: wait for element to be either detached from DOM, or have an empty bounding box or 'visibility:hidden'. This is opposite to the 'visible' option. <br /> `visible`: wait for element to have a non-empty bounding box and no 'visibility:hidden'. Note that an element without any content or with 'display:none' has an empty bounding box and is not considered visible.
4848

4949
## example
5050

@@ -81,7 +81,7 @@ if __name__ == "__main__":
8181
parser = PreParser(
8282
url_list=url_list,
8383
request_call_back_func=handle_preparser_result,
84-
parser_mode='api', # this mode depands on you set, you can use the "api" or "html"
84+
parser_mode='api', # this mode depends on what you set; you can use "api", "html", or 'html_dynamic'
8585
start_threading=True,
8686
threading_mode='single',
8787
cached_data=True,
@@ -114,6 +114,8 @@ Get help ➡️ [Github issue](https://github.com/BertramYe/preparser/issues)
114114

115115
# Update logs
116116

117+
* `version 2.0.6 `: add the `html_dynamic_scope` parameter to let users specify the dynamic parse scope, which can help speed up the preparser when the `parser_mode` is `html_dynamic`, and re-sort the additional tools into the `ToolsHelper` package.
118+
117119
* `version 2.0.5 `: remove the dynamic mode browser core install from setup into package call.
118120

119121
* `version 2.0.4 `: test the installing process command.

preparser/DynamicHelper.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from typing import Literal,Optional
66
from playwright.sync_api import sync_playwright
77

8+
9+
Moniter_Notes = list[str,Literal['attached', 'detached', 'hidden', 'visible']] | None
10+
811
class Dynamicer():
912
"""
1013
install the Browser Core
@@ -38,7 +41,7 @@ def _check_dynamic_async_env(self) -> int:
3841
self._async_index = installed_browser_index
3942
return installed_browser_index
4043

41-
def _get_dynamic_html(self,url:str) -> str | None:
44+
def _get_dynamic_html(self,url:str,moniter_scope:Moniter_Notes = None) -> str | None:
4245
try:
4346
if 0 <= self._async_index < 3:
4447
with sync_playwright() as p:
@@ -50,13 +53,27 @@ def _get_dynamic_html(self,url:str) -> str | None:
5053
browser = p.webkit.launch(headless=True)
5154
page = browser.new_page()
5255
page.goto(url)
53-
html = page.content()
56+
html = None
57+
if moniter_scope is not None:
58+
target_element = page.wait_for_selector(moniter_scope[0],state=moniter_scope[1])
59+
if target_element:
60+
target_element.scroll_into_view_if_needed()
61+
html = target_element.as_element().inner_html()
62+
# print(target_element.eval_on_selector_all('.row', 'elements => elements.map(el => el.outerHTML)'))
63+
# stop all of rest pages resouce loading to faster the loading speed
64+
page.route("**/*",lambda route,request:route.abort())
65+
# html = page.content()
66+
# stop specified type of resource loading, here below is the ['image', 'script'] two kind of the resouce type
67+
# page.route("**/*", lambda route, request: route.abort() if request.resource_type() in ['image', 'script'] else route.continue_())
68+
else:
69+
html = page.content()
70+
page.close()
5471
browser.close()
5572
return html
5673
else:
5774
return None
5875
except Exception as err:
59-
print(f'error when doing dynamic html parse , error: {err} !')
76+
print(f'error when parsing dynamic html , error: {err} !')
6077
return None
6178

6279
def init_install_browser(self):

preparser/PreParseHelper.py

Lines changed: 35 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@
33
from bs4 import BeautifulSoup
44
from typing import Callable,Literal,Any
55
from urllib.parse import urlparse
6-
from playwright.sync_api import sync_playwright,Browser
7-
from os import path
86
from .TaskHelper import Tasker
9-
from .DynamicHelper import Dynamicer
7+
from .DynamicHelper import Dynamicer,Moniter_Notes
108
# typing
119
Json_Data = dict[str, Any]
1210

@@ -16,30 +14,42 @@ class PreParser():
1614
A slight PreParser oject to handle the parsing task with threading pools or other methods from webpage urls or api urls.
1715
1816
Parameters:
19-
url_list (list): The list of URLs to parse from. Default is an empty list.
20-
request_call_back_func (Callable[[str,BeautifulSoup | Dict[str, Any]], Any] | None): A callback function according to the parser_mode to handle the `BeautifulSoup` object or request `json` Object. and if you want to show your business process failed, you can return `None`, otherwise please return a `not None` Object.
21-
parser_mode (Literal['html','api']): the pre-parsing datas mode,default is html,
17+
url_list(list):The list of URLs to parse from. Default is an empty list.
18+
request_call_back_func (Callable[[str,BeautifulSoup | Dict[str, Any]], Any] | None):A callback function according to the parser_mode to handle the `BeautifulSoup` object or request `json` Object. and if you want to show your business process failed, you can return `None`, otherwise please return a `not None` Object.
19+
parser_mode(Literal['html','api']): the pre-parsing datas mode,default is html,
2220
`html`: parse the content from static html, and return an `BeautifulSoup` Object.
2321
`api`: parse the datas from an api, and return the `json` Object.
2422
`html_dynamic`: parse from the whole webpage html content and return an `BeautifulSoup` Object, even the content that generated by the dynamic js code.
25-
cached_data (bool): weather cache the parsed datas, defalt is False.
26-
start_threading (bool): Whether to use threading pool for parsing the data. Default is False.
27-
threading_mode (Literal['map','single']): to run the task mode,default is `single`
23+
cached_data(bool): weather cache the parsed datas, defalt is False.
24+
start_threading(bool): Whether to use threading pool for parsing the data. Default is False.
25+
threading_mode(Literal['map','single']): to run the task mode,default is `single`.
2826
`map`: use the `map` func of the theading pool to distribute tasks.
2927
`single`: use the `submit` func to distribute the task one by one into the theading pool.
30-
stop_when_task_failed (bool) : wheather need stop when you failed to get request from a Url,default is True
31-
threading_numbers (int): The maximum number of threads in the threading pool. Default is 3.
32-
checked_same_site (bool): wheather need add more headers info to pretend requesting in a same site to parse datas, default is True,to resolve the CORS Block.
28+
stop_when_task_failed(bool): wheather need stop when you failed to get request from a Url,default is True.
29+
threading_numbers(int): The maximum number of threads in the threading pool. Default is 3.
30+
checked_same_site(bool): wheather need add more headers info to pretend requesting in a same site to parse datas, default is True,to resolve the CORS Block.
31+
html_dynamic_scope(list[str,Literal['attached', 'detached', 'hidden', 'visible']] | None): point to and get the specified scope dom of the whole page html, default is None, which stands for the whole page dom.
32+
else if this value was set, the parameter should be a list(2) Object.
33+
1. the first value is a tag <a href="https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector"> selecter</a>.
34+
for example, 'div#main' mean a div tag with 'id=main', 'div.test' will get the the first matched div tag with 'class = test'.
35+
but don't make the selecter too complex or matched the mutiple parent dom, otherwise you can't get their inner_html() correctly or time out.
36+
and finally you can get the BeautifulSoup object of the inner_html from this selecter selected tag in the `request_call_back_func`.
37+
2. the secound value should be one of the values below:
38+
`attached`: wait for element to be present in DOM
39+
`detached`: wait for element to not be present in DOM.
40+
`hidden`: wait for element to be either detached from DOM, or have an empty bounding box or `visibility:hidden`. This is opposite to the `visible` option.
41+
`visible`: wait for element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible.
3342
Attributes:
34-
url_list (list): The list of URLs to parse from.
35-
request_call_back_func (Callable[[str,BeautifulSoup | Dict[str, Any]], bool] | None): The callback function to process the BeautifulSoup Or Json object.
36-
parser_mode (Literal['html','api']): the preparse datas mode.
37-
cached_data (bool): weather to cache the parse datas.
38-
start_threading (bool): Whether to use threading pool.
39-
threading_mode (Literal['map','single']): to run the task mode.
40-
stop_when_task_failed (bool) : wheather need stop when you failed to get request from a Url.
41-
threading_numbers (int): The maximum number of threads.
42-
checked_same_site (bool): wheather need add more headers info to pretend requesting in a same site to parse datas, to resolve the CORS Block.
43+
url_list(list):The list of URLs to parse from.
44+
request_call_back_func(Callable[[str,BeautifulSoup | Dict[str, Any]], bool] | None): The callback function to process the BeautifulSoup Or Json object.
45+
parser_mode(Literal['html','api']): the preparse datas mode.
46+
cached_data(bool): weather to cache the parse datas.
47+
start_threading(bool): Whether to use threading pool.
48+
threading_mode(Literal['map','single']): to run the task mode.
49+
stop_when_task_failed(bool): wheather need stop when you failed to get request from a Url.
50+
threading_numbers(int): The maximum number of threads.
51+
checked_same_site(bool): wheather need add more headers info to pretend requesting in a same site to parse datas, to resolve the CORS Block.
52+
html_dynamic_scope(list[str,Literal['attached', 'detached', 'hidden', 'visible']] | None): to get and load the specified scope of html node resources.
4353
"""
4454
def __init__(self,
4555
url_list: list[str] = [],
@@ -50,7 +60,8 @@ def __init__(self,
5060
threading_mode:Literal['map','single'] = 'single',
5161
stop_when_task_failed:bool = True,
5262
threading_numbers: int = 3,
53-
checked_same_site:bool = True
63+
checked_same_site:bool = True,
64+
html_dynamic_scope:Moniter_Notes= None, # await loaded contions
5465
) -> None:
5566
self.to_parse_urls = url_list
5667
self.start_threading = start_threading
@@ -66,6 +77,7 @@ def __init__(self,
6677
self.dynamicer= Dynamicer()
6778
self._stop_running = False
6879
self._async_bundle_index = self._get_aync_bundle_index()
80+
self._html_dynamic_scope = html_dynamic_scope
6981

7082

7183
def _get_aync_bundle_index(self) -> int:
@@ -81,7 +93,7 @@ def _get_aync_bundle_index(self) -> int:
8193

8294
def _get_synamic_soup(self,url:str) -> BeautifulSoup | None:
8395
if self._async_bundle_index >= 0:
84-
html = self.dynamicer._get_dynamic_html(url)
96+
html = self.dynamicer._get_dynamic_html(url,self._html_dynamic_scope)
8597
if html:
8698
return BeautifulSoup(html, 'html.parser')
8799
return None
@@ -183,84 +195,7 @@ def _create_request_headers(self,url:str) -> dict[str,str]:
183195
return headers
184196

185197

186-
def find_all_betweem_same_level_nodes(start_node:BeautifulSoup | None =None,
187-
end_node:BeautifulSoup | None =None,
188-
include_start_node:bool=False,
189-
include_end_node:bool=False,
190-
parent_node:BeautifulSoup | None = None
191-
) -> BeautifulSoup | None:
192-
"""
193-
this function is help finding out the website elements nodes between specified two same level elements notes and finally return a new BeautifulSoup Object
194-
195-
Parameters:
196-
start_node (BeautifulSoup | None): The start elements nodes, defaut is None, which means from the target first one element to start get the element node
197-
end_node (BeautifulSoup | None): The end elements nodes, defaut is None, which means from the last element to start get the element node
198-
include_start_node (bool): when get the element nodes, weather include the start node, default is False.
199-
include_end_node (bool): when get the element nodes, weather include the end node, default is False.
200-
parent_node (BeautifulSoup | None): the parent element node which contained the start_node and end_node,
201-
if you set it , we just find the node in current nodes' children element,
202-
also default it can be None,which will be the parent node of the start_node or end_node.
203-
"""
204-
205-
206-
if (not start_node) and (not end_node):
207-
print("error: start_node and end_node are both None !!!")
208-
return None
209-
valid_numbers = 0
210-
parent = parent_node if parent_node else (start_node.parent if start_node else end_node.parent)
211-
parent_chidren_list = list(parent.children)
212-
if start_node:
213-
start_index = parent.index(start_node) + 1
214-
else:
215-
start_index = 1
216-
if end_node:
217-
end_index = parent.index(end_node)
218-
else:
219-
end_index = parent_chidren_list.__len__()
220-
if include_start_node:
221-
valid_numbers += 1
222-
start_index -= 1
223-
if include_end_node:
224-
valid_numbers += 1
225-
end_index += 1
226-
between_nodes_list = parent_chidren_list[start_index:end_index]
227-
if len(between_nodes_list) == valid_numbers:
228-
print('no sibling nodes between the start_node and end_node !!!')
229-
return None
230-
else:
231-
html_str = ''.join(str(node) for node in between_nodes_list)
232-
return BeautifulSoup(html_str, 'html.parser')
233-
234198

235-
def get_per_table_data(table_soup:BeautifulSoup) -> list[list[str]]:
236-
"""
237-
get the table datas from the standard element of table, which has 1 row head at most.
238-
239-
"""
240-
final_tables_row = []
241-
# thead
242-
thead = table_soup.find('thead')
243-
if thead:
244-
thead_th:list[BeautifulSoup] = thead.find('tr').find_all('th') # default the table only one title tr of thead
245-
th_datas = []
246-
for th in thead_th:
247-
# th_tx = repr(th.get_text(strip=True)).strip("'")
248-
th_tx = th.get_text(strip=True) # strip=True: get ride of the emply space from start and end
249-
th_datas.append(th_tx)
250-
final_tables_row.append(th_datas)
251-
# tbody
252-
tbody = table_soup.find('tbody')
253-
if tbody:
254-
tbody_tr:list[BeautifulSoup] = tbody.find_all('tr')
255-
for tr in tbody_tr:
256-
tr_datas = []
257-
tds:list[BeautifulSoup] = tr.find_all('td')
258-
for td in tds:
259-
# td_txt = repr(td.get_text(strip=True)).strip("'")
260-
td_txt = td.get_text(strip=True) # strip=True: get ride of the emply space from start and end
261-
tr_datas.append(td_txt)
262-
final_tables_row.append(tr_datas)
263-
return final_tables_row
264199

265200

266201

0 commit comments

Comments
 (0)