From 3e07d637574ac87e26250a9151b5ccd447b0b10b Mon Sep 17 00:00:00 2001
From: bigbrother666sh
Date: Sat, 18 Jan 2025 13:47:47 +0800
Subject: [PATCH] llm wrapper and prompt optimization, base url bug fix, mp scraper optimization

---
 core/agents/get_info.py         |  6 +++---
 core/agents/get_info_prompts.py |  8 ++++----
 core/general_process.py         | 19 ++++++++++---------
 core/llms/openai_wrapper.py     | 13 ++-----------
 core/scrapers/mp_scraper.py     |  6 ++++--
 core/tasks.py                   |  2 +-
 test/README.md                  | 18 +++++-------------
 test/README_EN.md               | 18 +++++-------------
 test/pre_process_test.py        |  4 ++--
 9 files changed, 36 insertions(+), 58 deletions(-)

diff --git a/core/agents/get_info.py b/core/agents/get_info.py
index 9974749..1f5e033 100644
--- a/core/agents/get_info.py
+++ b/core/agents/get_info.py
@@ -265,9 +265,9 @@ async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list
         for link in links:
             if link not in text_batch:
                 if _logger:
-                    _logger.warning(f"model generating hallucination:\n{result[-1]}")
+                    _logger.warning(f"model generated hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                 if test_mode:
-                    print(f"model hallucination:\n{result[-1]}")
+                    print(f"model hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                 continue
             cache.add(link)
         text_batch = ''
@@ -343,5 +343,5 @@ async def get_info(texts: list[str], link_dict: dict, prompts: list[str], focus_
             url_tags = re.findall(r'\[\d+\]', content)
             refences = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict}
             final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences})
-
+
     return final
diff --git a/core/agents/get_info_prompts.py b/core/agents/get_info_prompts.py
index f4c917b..9e8f433 100644
--- a/core/agents/get_info_prompts.py
+++ b/core/agents/get_info_prompts.py
@@ -36,8 +36,8 @@
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
-- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
-- 摘要信息务必忠于原文'''
+- 摘要应当详实、充分,且绝对忠于原文
+- 如果摘要涉及的原文片段中包含类似"[3]"这样的引用标记,务必在摘要中保留相关标记'''

 get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
 """
@@ -55,8 +55,8 @@
 {focus_statement}\n
 When extracting summaries, please follow these principles:
 - Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
-- The summary should be detailed and comprehensive
-- The summary should be faithful to the original text'''
+- The summary should be detailed and comprehensive and absolutely faithful to the original text
+- If the summary involves a reference marker like "[3]", it must be retained in the summary'''

 get_info_suffix_en = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. 
Here is an example of the output format:
 """
diff --git a/core/general_process.py b/core/general_process.py
index 58f0957..922de62 100644
--- a/core/general_process.py
+++ b/core/general_process.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 from utils.pb_api import PbTalker
 from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
 from agents.get_info import *
@@ -87,9 +88,10 @@ async def main_process(_sites: set | list):
     while working_list:
         url = working_list.pop()
         existing_urls.add(url)
+        wiseflow_logger.debug(f'processing new url, still {len(working_list)} urls in working list')
         has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
         if has_common_ext:
-            wiseflow_logger.info(f'{url} is a common file, skip')
+            wiseflow_logger.debug(f'{url} is a common file, skip')
             continue

         parsed_url = urlparse(url)
@@ -125,7 +127,6 @@ async def main_process(_sites: set | list):
         base_url = ''
         author = ''
         publish_date = ''
-
         if not raw_markdown:
             wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
             continue
@@ -136,17 +137,14 @@ async def main_process(_sites: set | list):
             base_url = metadata_dict.get('base', '')
             if not base_url:
                 base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-                if not base_url.endswith('/'):
-                    # 如果路径不以 / 结尾,则去掉最后一个路径段
-                    base_url = base_url.rsplit('/', 1)[0] + '/'
-
+
             if not author:
                 author = metadata_dict.get('author', '')
             if not publish_date:
                 publish_date = metadata_dict.get('publish_date', '')
-
+
         link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)
-
+
         if link_dict and links_parts:
             prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
             links_texts = []
@@ -154,6 +152,7 @@ async def main_process(_sites: set | list):
                 links_texts.extend(_parts.split('\n\n'))
             more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
             if more_url:
+                wiseflow_logger.debug(f'got {len(more_url)} more related urls, adding to working list')
                 working_list.update(more_url - existing_urls)

         if not contents:
@@ -173,10 +172,12 @@ async def main_process(_sites: set | list):
         prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
         infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
         if infos:
+            wiseflow_logger.debug(f'got {len(infos)} infos, saving to pb')
             await save_to_pb(url, title, infos)
     await crawler.close()

 if __name__ == '__main__':
+
     sites = pb.read('sites', filter='activated=True')
     wiseflow_logger.info('execute all sites one time')
-    asyncio.run(main_process([site['url'].rstrip('/') for site in sites]))
+    asyncio.run(main_process([site['url'] for site in sites]))
diff --git a/core/llms/openai_wrapper.py b/core/llms/openai_wrapper.py
index 75dd403..2eb4063 100644
--- a/core/llms/openai_wrapper.py
+++ b/core/llms/openai_wrapper.py
@@ -32,18 +32,9 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
         resp = response.choices[0].message.content
     except Exception as e:
         if logger:
-            logger.warning(f'{e}\nRetrying in 60 seconds...')
+            logger.warning(e)
         else:
-            print(f'{e}\nRetrying in 60 seconds...')
-            await asyncio.sleep(60)
-            response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
-            if response.status_code == 200 and response.choices:
-                resp = response.choices[0].message.content
-            else:
-                if logger:
-                    logger.error(f'after many try, llm 
error: {response}')
-                else:
-                    print(f'after many try, llm error: {response}')
+            print(e)
     finally:
         semaphore.release()
diff --git a/core/scrapers/mp_scraper.py b/core/scrapers/mp_scraper.py
index 0dfff67..649e69d 100644
--- a/core/scrapers/mp_scraper.py
+++ b/core/scrapers/mp_scraper.py
@@ -183,14 +183,16 @@ def process_content(content_div):
             if text:
                 content_parts.append(text)
             # 只在块级元素后添加换行符
-            if element.name in {'div', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
+            if element.name in {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                 content_parts.append('\n')
+            if element.name in {'div', 'section'}:
+                content_parts.append('# ')
         elif isinstance(element, str):
             text = element.strip()
             if text:
                 content_parts.append(text)

-    return ' '.join(content_parts).strip()
+    return ''.join(content_parts).strip()

     soup = BeautifulSoup(cleaned_html, 'html.parser')
diff --git a/core/tasks.py b/core/tasks.py
index f1338c7..573c882 100644
--- a/core/tasks.py
+++ b/core/tasks.py
@@ -15,7 +15,7 @@ async def schedule_pipeline(interval):
                 continue
             if counter % site['per_hours'] == 0:
                 wiseflow_logger.info(f"applying {site['url']}")
-                todo_urls.add(site['url'].rstrip('/'))
+                todo_urls.add(site['url'])
         counter += 1
         await main_process(todo_urls)
diff --git a/test/README.md b/test/README.md
index a6dcd26..975bbff 100644
--- a/test/README.md
+++ b/test/README.md
@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'

 ## html 内容解析

-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)

 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## 视觉大模型信息提取
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```

 ## 大模型信息提取测试
@@ -30,12 +22,12 @@ python get_visual_info_for_samples.py -F 'json_file_path'

 - 为测试任务创建 关注点说明,可以参考 [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json),

-- 要更改 get_info 的 prompt,请编辑 [prompts.py](./prompts.py)
-
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```

+*-I 是否需要同时测试 llm提取作者和发布时间*
+
 # 结果提交与共享

 wiseflow 是一个开源项目,希望通过大家共同的贡献,打造"人人可用的信息爬取工具"!
diff --git a/test/README_EN.md b/test/README_EN.md
index c6e9675..d8dcabf 100644
--- a/test/README_EN.md
+++ b/test/README_EN.md
@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...' 
 ## HTML Content Parsing

-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)

 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## Visual Large Model Information Extraction
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```

 ## Large Model Information Extraction Testing
@@ -30,12 +22,12 @@ python get_visual_info_for_samples.py -F 'json_file_path'

 - To create focus point descriptions for test tasks, refer to [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json)

-- To modify the prompt for get_info, edit [prompts.py](./prompts.py)
-
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```

+*-I whether to also test LLM extraction of the author and publish date*
+
 # Result Submission and Sharing

 Wiseflow is an open source project aiming to create an "information crawling tool for everyone" through collective contributions!
diff --git a/test/pre_process_test.py b/test/pre_process_test.py
index cb00473..bf969f8 100644
--- a/test/pre_process_test.py
+++ b/test/pre_process_test.py
@@ -89,8 +89,8 @@ async def main(html_sample, record_file):
     base_url = html_sample.get('base', '')
     if not base_url:
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-        if not base_url.endswith('/'):
-            base_url = base_url.rsplit('/', 1)[0] + '/'
+
+    print('base_url:', base_url)

     if not author:
         author = html_sample.get('author', '')
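
Note on the get_info.py / get_info_prompts.py changes: the prompts now instruct the model to keep citation markers such as "[3]" in its summaries precisely because the unchanged context lines in get_info resolve those markers back into URLs; a marker the model drops can never be resolved. The standalone sketch below restates that resolution step; the function name collect_references and the sample data are illustrative only, not part of the patch:

```python
import re

def collect_references(content: str, link_dict: dict[str, str]) -> dict[str, str]:
    # Find citation markers like "[3]" in the model's summary, then map each
    # marker back to its source URL via link_dict.
    url_tags = re.findall(r'\[\d+\]', content)
    return {tag: link_dict[tag] for tag in url_tags if tag in link_dict}

refs = collect_references(
    "Funding round announced[2]; hiring plans confirmed[5].",
    {"[2]": "https://example.com/funding", "[5]": "https://example.com/hiring"},
)
print(refs)  # {'[2]': 'https://example.com/funding', '[5]': 'https://example.com/hiring'}
```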
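
On the "base url bug" named in the subject: general_process.py and pre_process_test.py both stop trimming the last path segment when deriving base_url. One plausible failure mode of the removed code is shown below (a demonstration under assumed inputs, not a case cited by the patch), together with the reason the trimming was unnecessary for relative-link resolution:

```python
from urllib.parse import urljoin

# Old behavior: base_url = f"{scheme}://{netloc}{path}" yields
# "https://example.com" for a page with an empty path, and trimming the
# "last path segment" then leaves only the scheme.
url = "https://example.com"
old_base = url if url.endswith("/") else url.rsplit("/", 1)[0] + "/"
print(old_base)  # https:// -- the host is lost entirely

# New behavior: keep the path intact; urljoin already drops the final
# segment when resolving relative links, so no manual trimming is needed.
print(urljoin("https://example.com/blog/post.html", "img.png"))
# https://example.com/blog/img.png
```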
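
The openai_wrapper.py hunk replaces the in-wrapper sleep-and-retry (whose retry path also read a status_code attribute that chat-completion objects in the v1 openai SDK do not appear to expose) with a single logged attempt, leaving retry policy to the caller. A condensed, self-contained sketch of the resulting control flow; unlike the real module, which builds the AsyncOpenAI client and semaphore from environment config, both are passed in or hard-coded here:

```python
import asyncio

semaphore = asyncio.Semaphore(5)  # concurrency limit; config-driven in the real module

async def openai_llm(messages: list, model: str, client, logger=None, **kwargs) -> str:
    # One attempt only: on failure, log (or print) the exception and return
    # the empty string. `finally` guarantees the semaphore is released, so a
    # failed call can no longer hold a slot through a 60-second retry window.
    await semaphore.acquire()
    resp = ''
    try:
        response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
        resp = response.choices[0].message.content
    except Exception as e:
        if logger:
            logger.warning(e)
        else:
            print(e)
    finally:
        semaphore.release()
    return resp
```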
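
Finally, mp_scraper.py: joining content_parts with ' ' re-inserted stray spaces around every separator the code had just appended, so the join is now '' and only explicit separators survive. The '# ' emitted at <div>/<section> boundaries then lands at the start of the next line, so the fragment after a section boundary renders as a markdown heading. A small illustration with made-up fragments:

```python
# Fragments as process_content would collect them: paragraph text, a '\n'
# appended after a <p>, and '# ' appended at a <section> boundary.
content_parts = ['Intro paragraph', '\n', '# ', 'Section title', '\n']

old = ' '.join(content_parts)   # 'Intro paragraph \n #  Section title \n' -- stray spaces
new = ''.join(content_parts).strip()
print(new)
# Intro paragraph
# # Section title   <- the fragment after the boundary becomes an h1 heading
```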