Skip to content

Commit 660daec

Browse files
committed
add the html_dynamic parse mode
1 parent 05c6613 commit 660daec

File tree

7 files changed

+298
-31
lines changed

7 files changed

+298
-31
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
__pycache__
12
*__pycache__
23
Pipfile
34
result

InstallHelper.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from os import path,getenv,listdir
2+
from setuptools import Command
3+
from subprocess import check_call
4+
from setuptools.command.install import install
5+
from typing import Literal,Optional
6+
import sys
7+
from platform import system
8+
9+
10+
class BrowserCoreInstaller(Command):
    """
    Interactive setuptools command that manages the preparser browser cores.

    It looks into the Playwright browser cache for already downloaded
    browsers (chromium, firefox, webkit), asks the user whether each one
    should be reinstalled, removed or kept, and offers to install the
    missing ones.
    """

    description = "Install the specified preparser browser core (chromium, firefox, or webkit)"
    # distutils/setuptools expects every command class to expose
    # ``user_options`` as a list; this command is driven purely by interactive
    # prompts, so it declares no options.
    user_options = []

    def initialize_options(self):
        """Set the default state and scan the cache for installed browsers."""
        self.browser_list = ['chromium', 'firefox', 'webkit']
        self.installed_browsers = []
        self.need_recheck_browsers = []
        self.browser_cache_path = None
        self.preched_installed_browsers = self._get_pre_installed_browsers()

    def _get_pre_installed_browsers(self) -> list[str]:
        """
        Return the names from ``browser_list`` found in the Playwright cache.

        The cache directory is taken from ``PLAYWRIGHT_BROWSERS_PATH`` when it
        is set to a real path, otherwise from the per-platform default
        location. ``self.browser_cache_path`` is only set when that directory
        exists. The result follows the order of ``browser_list`` and contains
        no duplicates.
        """
        os_type = system()
        custom_path = getenv("PLAYWRIGHT_BROWSERS_PATH")
        if custom_path and custom_path != "0":
            # "0" is not a directory, so it is ignored here
            cache_path = custom_path
        elif os_type == "Windows":
            # browsers live under Local, not under Roaming (%APPDATA%)
            local_app_data = getenv("LOCALAPPDATA") or path.join(
                path.expanduser("~"), "AppData", "Local"
            )
            cache_path = path.join(local_app_data, "ms-playwright")
        elif os_type == "Darwin":
            cache_path = path.expanduser("~/Library/Caches/ms-playwright")
        else:
            cache_path = path.expanduser("~/.cache/ms-playwright")

        if not path.isdir(cache_path):
            return []

        self.browser_cache_path = cache_path
        browser_folders = [
            folder_name
            for folder_name in listdir(cache_path)
            if path.isdir(path.join(cache_path, folder_name))
        ]
        # one entry per browser, even when several folders share the prefix
        return [
            key
            for key in self.browser_list
            if any(folder_name.startswith(key) for folder_name in browser_folders)
        ]

    def precheck_installed_browsers(self):
        """
        Ask what to do with every browser that is already in the cache.

        Kept or reinstalled browsers end up in ``installed_browsers``;
        browsers that were not found end up in ``need_recheck_browsers``.
        Removed browsers are put in neither list.
        """
        self.installed_browsers = []
        self.need_recheck_browsers = []
        for browser_name in self.browser_list:
            if browser_name not in self.preched_installed_browsers:
                self.need_recheck_browsers.append(browser_name)
                continue
            print(f'find that browser {browser_name} of preparser installed !')
            operate_choice = self.check_choice_avalible(
                " do you want to reinstall,remove or keep it ? (1 : reinstall, 2: remove , 3: keep.): ",
                ['1', '2', '3'],
            )
            if operate_choice == "3":
                self.installed_browsers.append(browser_name)
                continue
            # "1" (reinstall) and "2" (remove) both start with an uninstall
            self.operate_browser("uninstall", browser_name)
            if operate_choice == '1':
                self.operate_browser("install", browser_name)
                self.installed_browsers.append(browser_name)

    def init_install_browser(self):
        """Let the user pick one browser, install it and return its name."""
        print("please choose a preparser browser to install: ")
        print("[1] chromium, [2] firefox, [3] webkit.")
        choice = self.check_choice_avalible(
            'please input a number to choose a browser (1/2/3):', ['1', '2', '3']
        )
        browser = self.browser_list[int(choice) - 1]
        self.operate_browser("install", browser)
        return browser

    def operate_browser(self, command_type: Literal["install", "uninstall"], browser_name: str):
        """
        Run ``python -m playwright <command_type> <browser_name>``.

        Raises ``subprocess.CalledProcessError`` when the command exits with
        a non-zero status.
        """
        # NOTE(review): confirm that `playwright uninstall` accepts a browser
        # name argument the same way `playwright install` does.
        print(f"{command_type}ing preparser browser {browser_name} ...")
        check_call([sys.executable, "-m", "playwright", command_type, browser_name])

    def check_choice_avalible(self, alert_message: str, valid_choices: list[str]) -> Optional[str]:
        """
        Prompt with ``alert_message`` until the answer is in ``valid_choices``.

        Surrounding whitespace in the answer is ignored. The accepted choice
        is returned.
        """
        while True:
            choice = input(alert_message).strip()
            if choice in valid_choices:
                return choice
            print(f"Invalid choice, available choices: {','.join(valid_choices)}. Please try again.")

    def finalize_options(self):
        """
        Nothing to finalize: the command has no options.

        The "at least one browser" guarantee is enforced at the end of
        ``run``. At this point ``installed_browsers`` is always still empty,
        so checking it here would force an install prompt on every
        invocation.
        """

    def _ensure_browser_installed(self):
        """Install one browser of the user's choice when none is installed."""
        if not self.installed_browsers:
            print("warning: to use preparser, you need at least one of the preparser browsers installed .")
            self.installed_browsers.append(self.init_install_browser())

    def run(self):
        """
        Drive the whole interactive check.

        Handles the browsers already in the cache, offers to install the
        missing ones, and finally makes sure at least one browser is
        installed.
        """
        print("checking weather there were preparser browsers' core installed .....")
        self.precheck_installed_browsers()
        missing_browsers = list(self.need_recheck_browsers)
        if missing_browsers:
            if len(missing_browsers) < len(self.browser_list):
                print(f'there were browsers of preparser not installed {",".join(missing_browsers)}.')
                # joined outside the f-string: reusing the same quote type
                # inside an f-string is a SyntaxError before Python 3.12
                installed_names = ",".join(self.installed_browsers)
                print(f"warning: added more or not won't effect your next process, as you have installed {installed_names} ")
                choice = self.check_choice_avalible('do you still want to add them ? (yes/no) : ', ['yes', 'no'])
                if choice == 'yes':
                    for browser in missing_browsers:
                        to_install_choice = self.check_choice_avalible(
                            f'do you want to install {browser} of preparser ? (yes/no) : ', ['yes', 'no']
                        )
                        if to_install_choice == 'yes':
                            self.operate_browser("install", browser)
                            self.installed_browsers.append(browser)
            else:  # no browser installed
                self.installed_browsers.append(self.init_install_browser())
        # covers the case where every found browser was removed above
        self._ensure_browser_installed()
114+
115+
116+
class PreInstaller(install):
    """
    ``install`` command that runs the browser core check before installing.

    The interactive check is skipped when ``bdist_wheel`` or ``build`` is on
    the command line; the default installation always runs afterwards.
    """

    def run(self):
        """Run the browser core check (unless building), then the default install."""
        print('Prechecking the environment status, before install preparser!!!')
        if 'bdist_wheel' in sys.argv or 'build' in sys.argv:
            # Avoid running the browser installer during the build phase
            print("Skipping browser core installation during build.")
        else:
            # Execute the interactive playwright install/check command
            BrowserCoreInstaller(self.distribution).run()
            print("All prechecks finished, begin installing preparser...")
        # Execute the default install. The base implementation must be called
        # here: `self.run(self)` would call this override again with an extra
        # argument and raise TypeError.
        super().run()
130+
131+
132+

MANIFEST.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
include README.md
22
include LICENSE
3-
3+
include InstallHelper.py
44
# include setup.py # no need to include it explicitly; setup.py is included automatically when the package is uploaded to PyPI
55

66
# get rid of the Pipfile files, as they are only used for building the virtual environment

README.md

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ this is a sight Parser to help you pre_parser the datas from `specified website
44

55
# Attention
66

7-
as this slight pre_parser was just based the module of the `requests` and `beautifulsoup4`, which mainly was used to parse the data from the `api` and page randered as `static html`, so it can't directly parsed datas that need waiting the whole website pages was loaded, but for this function , maybe I will added later in future.
7+
The old version 1.0.0 of this lightweight pre_parser could only pre-parse `static html` or `api` content, but since version 2.0.0 a new `html_dynamic` mode has been added, which retrieves the full page content, including content generated by `JS` code.
88

99
```bash
1010

@@ -37,7 +37,7 @@ here below are some of the parameters you can use for initai the Object `PrePars
3737
| --------------------- | ----------------- |-------------------------------------------------------- |
3838
| url_list | list | The list of URLs to parse from. Default is an empty list. |
3939
| request_call_back_func | Callable or None | A callback function according to the parser_mode to handle the `BeautifulSoup` object or request `json` Object. and if you want to show your business process failed, you can return `None`, otherwise please return a `not None` Object. |
40-
| parser_mode | `'html'` or `'api'` | The pre-parsing datas mode,default is `'html'`.<br/> `html`: use the bs4 to parse the datas, and return an `BeautifulSoup` Object. <br/> `api` : use requests only and return an `json` object. <br/> **and all of them you can get it when you set the `request_call_back_func`, otherwise get it via the object of `PreParer(....).cached_request_datas` |
40+
| parser_mode | `'html'`, `'api'` or `'html_dynamic'` | The pre-parsing data mode, default is `'html'`.<br/> `html`: parse the content from static html, and return a `BeautifulSoup` Object. <br/> `api`: parse the data from an api, and return the `json` Object. <br/> `html_dynamic`: parse the whole webpage html content, including content generated by dynamic JS code, and return a `BeautifulSoup` Object. <br/> **All of these Objects are passed to `request_call_back_func` when you define it; otherwise get them via `PreParer(....).cached_request_datas`.** |
4141
| cached_data | bool | Whether to cache the parsed data. Default is `False`. |
4242
| start_threading | bool | Whether to use threading pool for parsing the data. Default is `False`.|
4343
| threading_mode | `'map'` or `'single'` | The mode used to run the tasks, default is `single`. <br/> `map`: use the `map` func of the threading pool to distribute tasks. <br/> `single`: use the `submit` func to distribute the tasks one by one into the threading pool. |
@@ -94,7 +94,7 @@ if __name__ == "__main__":
9494
parser.start_parse()
9595

9696
# when all tasks have finished, you can get all of the task results like below:
97-
all_results = parser.cached_request_datas
97+
all_result = parser.cached_request_datas
9898

9999
# if you want to terminate the parsing, just execute the function below
100100
# parser.stop_parse()
@@ -111,3 +111,19 @@ if __name__ == "__main__":
111111

112112
Get help ➡️ [Github issue](https://github.com/BertramYe/preparser/issues)
113113

114+
115+
# Update logs
116+
117+
* `version 2.0.5 `: move the dynamic mode browser core installation from setup into the package call.
118+
119+
* `version 2.0.4 `: test the installing process command.
120+
121+
* `version 2.0.3 `: optimise the `error` alert for `html_dynamic`.
122+
123+
* `version 2.0.2 `: correct the README Doc of `parser_mode`.
124+
125+
* `version 2.0.1 `: update the README Doc.
126+
127+
* `version 2.0.0 `: add the new `parser_mode` `html_dynamic`, which helps `preparser` get all of the content from `html`, even if it is generated by `JS` code.
128+
129+
* `version 1.0.0 `: basic version, which can only `preparser` the static `html` and `api` content.

preparser/DynamicHelper.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
2+
import sys
3+
from os import path
4+
from subprocess import check_call
5+
from typing import Literal,Optional
6+
from playwright.sync_api import sync_playwright
7+
8+
class Dynamicer():
    """
    Helper for the ``html_dynamic`` parse mode.

    It finds (or interactively installs) a Playwright browser core and uses
    it to load a page and return the HTML content reported by the browser.
    """

    def __init__(self) -> None:
        self.browser_list = ['chromium', 'firefox', 'webkit']
        # index into ``browser_list`` of the browser to use; -1 means none
        self._async_index = -1

    def _check_dynamic_async_env(self) -> int:
        """
        Find the first browser whose executable exists, installing one if none does.

        Returns the index of the usable browser in ``browser_list`` (also
        stored in ``self._async_index``), or -1 when the check failed.
        """
        installed_browser_index = -1
        try:
            with sync_playwright() as p:
                for i, browser_name in enumerate(self.browser_list):
                    # p.chromium / p.firefox / p.webkit, in browser_list order
                    browser_bundle = getattr(p, browser_name)
                    if path.exists(browser_bundle.executable_path):
                        installed_browser_index = i
                        break
                if installed_browser_index == -1:
                    # Only an install is offered here; no re-install logic,
                    # since this check runs at use time.
                    installed_browser_index = self.init_install_browser()
        except Exception as error:
            print(f"error: when check the preparser browser bundle, error:{error} !!!")
            print('please try again, if failed again, please reinstall preparser !!!')
        finally:
            self._async_index = installed_browser_index
        return installed_browser_index

    def _get_dynamic_html(self, url: str) -> Optional[str]:
        """
        Load ``url`` in the selected headless browser and return ``page.content()``.

        Returns ``None`` when no browser has been selected yet or when any
        error occurs while loading the page (the error is printed).
        """
        if not 0 <= self._async_index < len(self.browser_list):
            return None
        try:
            with sync_playwright() as p:
                browser_type = getattr(p, self.browser_list[self._async_index])
                browser = browser_type.launch(headless=True)
                try:
                    page = browser.new_page()
                    page.goto(url)
                    return page.content()
                finally:
                    # close the browser even when loading the page fails
                    browser.close()
        except Exception as err:
            print(f'error when doing dynamic html parse , error: {err} !')
            return None

    def init_install_browser(self):
        """Let the user pick one browser, install it and return its index in ``browser_list``."""
        print("please choose a preparser browser to install: ")
        print("[1] chromium, [2] firefox, [3] webkit.")
        choice = self.check_choice_avalible(
            'please input a number to choose a browser (1/2/3):', ['1', '2', '3']
        )
        browser_index = int(choice) - 1
        self.operate_browser("install", self.browser_list[browser_index])
        return browser_index

    def operate_browser(self, command_type: Literal["install", "uninstall"], browser_name: str):
        """
        Run ``python -m playwright <command_type> <browser_name>``.

        Raises ``subprocess.CalledProcessError`` when the command exits with
        a non-zero status.
        """
        print(f"{command_type}ing preparser browser {browser_name} ...")
        check_call([sys.executable, "-m", "playwright", command_type, browser_name])

    def check_choice_avalible(self, alert_message: str, valid_choices: list[str]) -> Optional[str]:
        """
        Prompt with ``alert_message`` until the answer is in ``valid_choices``.

        Surrounding whitespace in the answer is ignored. The accepted choice
        is returned.
        """
        while True:
            choice = input(alert_message).strip()
            if choice in valid_choices:
                return choice
            print(f"Invalid choice, available choices: {','.join(valid_choices)}. Please try again.")
83+

0 commit comments

Comments
 (0)