Commit 3e07d63

LLM wrapper and prompt optimization; base_url bug fix; mp_scraper optimization

Parent: dd7d924

9 files changed: +36 −58 lines

core/agents/get_info.py (+3 −3)

@@ -265,9 +265,9 @@ async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list
         for link in links:
             if link not in text_batch:
                 if _logger:
-                    _logger.warning(f"model generating hallucination:\n{result[-1]}")
+                    _logger.warning(f"model generating hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                 if test_mode:
-                    print(f"model hallucination:\n{result[-1]}")
+                    print(f"model hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                 continue
             cache.add(link)
         text_batch = ''
@@ -343,5 +343,5 @@ async def get_info(texts: list[str], link_dict: dict, prompts: list[str], focus_
             url_tags = re.findall(r'\[\d+\]', content)
             refences = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict}
             final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences})
-
+
     return final
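Note: the two changed lines enrich the hallucination warning so it now records the offending link tag and the exact text batch the model was given, not just the raw model reply. A minimal sketch of the guard, assuming "[n]"-style link tags and ignoring the real function's caching and batching:

```python
# Minimal sketch of the hallucination guard (simplified; the real code also
# keeps a cache of accepted links and logs result[-1], the raw model reply).
def filter_hallucinated_links(links: list[str], text_batch: str) -> list[str]:
    kept = []
    for link in links:
        if link not in text_batch:
            # The enriched message pairs the fabricated tag with the input
            # batch, so the mismatch can be inspected directly in the log.
            print(f"model hallucination:\n{link}\n{text_batch}")
            continue
        kept.append(link)
    return kept

print(filter_hallucinated_links(['[1]', '[9]'], 'see [1] and [2]'))  # ['[1]']
```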

core/agents/get_info_prompts.py (+4 −4)

@@ -36,8 +36,8 @@
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
-- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
-- 摘要信息务必忠于原文'''
+- 摘要应当详实、充分,且绝对忠于原文
+- 如果摘要涉及的原文片段中包含类似"[3]"这样的引用标记,务必在摘要中保留相关标记'''
 
 get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
 """
@@ -55,8 +55,8 @@
 {focus_statement}\n
 When extracting summaries, please follow these principles:
 - Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
-- The summary should be detailed and comprehensive
-- The summary should be faithful to the original text'''
+- The summary should be detailed and comprehensive and absolutely faithful to the original text
+- If the summary involves a reference marker like "[3]", it must be retained in the summary'''
 
 get_info_suffix_en = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
 """

core/general_process.py (+10 −9)

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 from utils.pb_api import PbTalker
 from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
 from agents.get_info import *
@@ -87,9 +88,10 @@ async def main_process(_sites: set | list):
     while working_list:
         url = working_list.pop()
         existing_urls.add(url)
+        wiseflow_logger.debug(f'process new url, still {len(working_list)} urls in working list')
         has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
         if has_common_ext:
-            wiseflow_logger.info(f'{url} is a common file, skip')
+            wiseflow_logger.debug(f'{url} is a common file, skip')
             continue
 
         parsed_url = urlparse(url)
@@ -125,7 +127,6 @@ async def main_process(_sites: set | list):
         base_url = ''
         author = ''
         publish_date = ''
-
         if not raw_markdown:
             wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
             continue
@@ -136,24 +137,22 @@ async def main_process(_sites: set | list):
             base_url = metadata_dict.get('base', '')
             if not base_url:
                 base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-                if not base_url.endswith('/'):
-                    # 如果路径不以 / 结尾,则去掉最后一个路径段
-                    base_url = base_url.rsplit('/', 1)[0] + '/'
-
+
             if not author:
                 author = metadata_dict.get('author', '')
             if not publish_date:
                 publish_date = metadata_dict.get('publish_date', '')
-
+
         link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)
-
+
         if link_dict and links_parts:
             prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
             links_texts = []
             for _parts in links_parts:
                 links_texts.extend(_parts.split('\n\n'))
             more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
             if more_url:
+                wiseflow_logger.debug(f'get {len(more_url)} more related urls, will add to working list')
                 working_list.update(more_url - existing_urls)
 
         if not contents:
@@ -173,10 +172,12 @@ async def main_process(_sites: set | list):
             prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
             infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
             if infos:
+                wiseflow_logger.debug(f'get {len(infos)} infos, will save to pb')
                 await save_to_pb(url, title, infos)
     await crawler.close()
 
 if __name__ == '__main__':
+
     sites = pb.read('sites', filter='activated=True')
     wiseflow_logger.info('execute all sites one time')
-    asyncio.run(main_process([site['url'].rstrip('/') for site in sites]))
+    asyncio.run(main_process([site['url'] for site in sites]))
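Note: this is the base_url bug from the commit message. The old fixup trimmed the last path segment off any base URL not ending in '/', and the __main__ block additionally stripped trailing slashes from site URLs; both are gone. One plausible reading (not spelled out in the commit) is that the fixup was destructive for path-less URLs and redundant for relative-link resolution anyway:

```python
from urllib.parse import urljoin, urlparse

# The removed fixup eats the host when the URL has no path at all, and
# urljoin needs no help anyway: it drops the final segment of a slash-less
# base by itself when resolving a relative link.
for url in ('https://example.com/news/article.html', 'https://example.com'):
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    trimmed = base_url if base_url.endswith('/') else base_url.rsplit('/', 1)[0] + '/'
    print(repr(trimmed), '->', urljoin(base_url, 'next.html'))

# 'https://example.com/news/' -> https://example.com/news/next.html
# 'https://'                  -> https://example.com/next.html
```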

core/llms/openai_wrapper.py (+2 −11)

@@ -32,18 +32,9 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
         resp = response.choices[0].message.content
     except Exception as e:
         if logger:
-            logger.warning(f'{e}\nRetrying in 60 second...')
+            logger.warning(e)
         else:
-            print(f'{e}\nRetrying in 60 second...')
-        await asyncio.sleep(60)
-        response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
-        if response.status_code == 200 and response.choices:
-            resp = response.choices[0].message.content
-        else:
-            if logger:
-                logger.error(f'after many try, llm error: {response}')
-            else:
-                print(f'after many try, llm error: {response}')
+            print(e)
     finally:
         semaphore.release()
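Note: the wrapper no longer sleeps 60 seconds and retries inside the except block; it now just logs the exception and releases the semaphore. The removed fallback also read response.status_code, an attribute the chat-completions response object does not normally expose, so the retry path itself could raise. If retrying is still wanted, a caller-side loop is one option; a hypothetical sketch, assuming openai_llm returns an empty string on failure:

```python
import asyncio

# Hypothetical caller-side retry, not part of the repo: treat an empty
# reply from the wrapper as the signal to wait and try again.
async def call_with_retry(llm, messages: list, model: str,
                          retries: int = 2, delay: float = 60.0) -> str:
    for attempt in range(retries + 1):
        resp = await llm(messages, model)
        if resp:
            return resp
        if attempt < retries:
            await asyncio.sleep(delay)
    return ''
```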

core/scrapers/mp_scraper.py (+4 −2)

@@ -183,14 +183,16 @@ def process_content(content_div):
             if text:
                 content_parts.append(text)
             # 只在块级元素后添加换行符
-            if element.name in {'div', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
+            if element.name in {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                 content_parts.append('\n')
+            if element.name in {'div', 'section'}:
+                content_parts.append('# ')
         elif isinstance(element, str):
             text = element.strip()
             if text:
                 content_parts.append(text)
 
-    return ' '.join(content_parts).strip()
+    return ''.join(content_parts).strip()
 
     soup = BeautifulSoup(cleaned_html, 'html.parser')
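Note: two coordinated changes. div and section boundaries now emit a markdown heading marker ('# ') instead of a bare newline, and the final join no longer inserts a space between every pair of parts, which used to scatter spurious spaces through CJK text. Illustrative effect on a hypothetical parts list:

```python
# Hypothetical content_parts as process_content might accumulate them:
# a paragraph, then a <section> boundary turned into a heading marker.
content_parts = ['正文第一段', '\n', '# ', '下一节标题', '\n']

print(repr(' '.join(content_parts).strip()))  # old: '正文第一段 \n #  下一节标题'
print(repr(''.join(content_parts).strip()))   # new: '正文第一段\n# 下一节标题'
```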

core/tasks.py (+1 −1)

@@ -15,7 +15,7 @@ async def schedule_pipeline(interval):
             continue
         if counter % site['per_hours'] == 0:
             wiseflow_logger.info(f"applying {site['url']}")
-            todo_urls.add(site['url'].rstrip('/'))
+            todo_urls.add(site['url'])
 
         counter += 1
         await main_process(todo_urls)
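Note: the same trailing-slash policy applied to the scheduler: stored site URLs enter the todo set untouched. A side effect worth knowing (not stated in the commit) is that variants differing only by a trailing slash no longer collapse into one entry:

```python
# Before this commit both entries normalized to the same string and
# deduplicated in the set; now each is kept and crawled as stored.
todo_urls = set()
todo_urls.add('https://example.com/blog')
todo_urls.add('https://example.com/blog/')
print(sorted(todo_urls))  # ['https://example.com/blog', 'https://example.com/blog/']
```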

test/README.md (+5 −13)

@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'
 
 ## html 内容解析
 
-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)
 
 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## 视觉大模型信息提取
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```
 
 ## 大模型信息提取测试
@@ -30,12 +22,12 @@ python get_visual_info_for_samples.py -F 'json_file_path'
 
 - 为测试任务创建 关注点说明,可以参考 [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json),
 
-- 要更改 get_info 的 prompt,请编辑 [prompts.py](./prompts.py)
-
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```
 
+*-I 是否需要同时测试 llm提取作者和发布时间*
+
 # 结果提交与共享
 
 wiseflow 是一个开源项目,希望通过大家共同的贡献,打造“人人可用的信息爬取工具”!

test/README_EN.md (+5 −13)

@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'
 
 ## HTML Content Parsing
 
-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)
 
 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## Visual Large Model Information Extraction
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```
 
 ## Large Model Information Extraction Testing
@@ -30,12 +22,12 @@ python get_visual_info_for_samples.py -F 'json_file_path'
 
 - To create focus point descriptions for test tasks, refer to [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json)
 
-- To modify the prompt for get_info, edit [prompts.py](./prompts.py)
-
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```
 
+*-I whether to test LLM extraction of author and publish date*
+
 # Result Submission and Sharing
 
 Wiseflow is an open source project aiming to create an "information crawling tool for everyone" through collective contributions!

test/pre_process_test.py (+2 −2)

@@ -89,8 +89,8 @@ async def main(html_sample, record_file):
     base_url = html_sample.get('base', '')
     if not base_url:
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-        if not base_url.endswith('/'):
-            base_url = base_url.rsplit('/', 1)[0] + '/'
+
+    print('base_url:', base_url)
 
     if not author:
         author = html_sample.get('author', '')
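Note: the test mirrors the production change: base_url is whatever the sample's 'base' field or the parsed URL yields, with no trailing-slash fixup, and the new print exposes the value during test runs. A standalone check of the computation, with a made-up sample:

```python
from urllib.parse import urlparse

# Made-up sample in the shape pre_process_test.py consumes.
html_sample = {'url': 'https://example.com/news/article.html', 'base': ''}

parsed_url = urlparse(html_sample['url'])
base_url = html_sample.get('base', '') or f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
print('base_url:', base_url)  # base_url: https://example.com/news/article.html
```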
