From 3e07d637574ac87e26250a9151b5ccd447b0b10b Mon Sep 17 00:00:00 2001
From: bigbrother666sh
Date: Sat, 18 Jan 2025 13:47:47 +0800
Subject: [PATCH] llm wrapper and prompt optimization, base url bug fix, mp scraper optimization

---
 core/agents/get_info.py         |  6 +++---
 core/agents/get_info_prompts.py |  8 ++++----
 core/general_process.py         | 19 ++++++++++---------
 core/llms/openai_wrapper.py     | 13 ++-----------
 core/scrapers/mp_scraper.py     |  6 ++++--
 core/tasks.py                   |  2 +-
 test/README.md                  | 18 +++++-------------
 test/README_EN.md               | 18 +++++-------------
 test/pre_process_test.py        |  4 ++--
 9 files changed, 36 insertions(+), 58 deletions(-)

diff --git a/core/agents/get_info.py b/core/agents/get_info.py
index 9974749..1f5e033 100644
--- a/core/agents/get_info.py
+++ b/core/agents/get_info.py
@@ -265,9 +265,9 @@ async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list
         for link in links:
             if link not in text_batch:
                 if _logger:
-                    _logger.warning(f"model generating hallucination:\n{result[-1]}")
+                    _logger.warning(f"model generated hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                 if test_mode:
-                    print(f"model hallucination:\n{result[-1]}")
+                    print(f"model hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                 continue
             cache.add(link)
         text_batch = ''
@@ -343,5 +343,5 @@ async def get_info(texts: list[str], link_dict: dict, prompts: list[str], focus_
             url_tags = re.findall(r'\[\d+\]', content)
             refences = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict}
             final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences})
-
+
     return final
diff --git a/core/agents/get_info_prompts.py b/core/agents/get_info_prompts.py
index f4c917b..9e8f433 100644
--- a/core/agents/get_info_prompts.py
+++ b/core/agents/get_info_prompts.py
@@ -36,8 +36,8 @@
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
-- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
-- 摘要信息务必忠于原文'''
+- 摘要应当详实、充分,且绝对忠于原文
+- 如果摘要涉及的原文片段中包含类似"[3]"这样的引用标记,务必在摘要中保留相关标记'''

 get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
 """
@@ -55,8 +55,8 @@
 {focus_statement}\n
 When extracting summaries, please follow these principles:
 - Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
-- The summary should be detailed and comprehensive
-- The summary should be faithful to the original text'''
+- The summary should be detailed and comprehensive and absolutely faithful to the original text
+- If the summary involves a reference marker like "[3]", it must be retained in the summary'''

 get_info_suffix_en = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. 
Here is an example of the output format:
 """
diff --git a/core/general_process.py b/core/general_process.py
index 58f0957..922de62 100644
--- a/core/general_process.py
+++ b/core/general_process.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 from utils.pb_api import PbTalker
 from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
 from agents.get_info import *
@@ -87,9 +88,10 @@ async def main_process(_sites: set | list):
     while working_list:
         url = working_list.pop()
         existing_urls.add(url)
+        wiseflow_logger.debug(f'processing new url, still {len(working_list)} urls in working list')
         has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
         if has_common_ext:
-            wiseflow_logger.info(f'{url} is a common file, skip')
+            wiseflow_logger.debug(f'{url} is a common file, skip')
             continue

         parsed_url = urlparse(url)
@@ -125,7 +127,6 @@ async def main_process(_sites: set | list):
         base_url = ''
         author = ''
         publish_date = ''
-
         if not raw_markdown:
             wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
             continue
@@ -136,17 +137,14 @@ async def main_process(_sites: set | list):
             base_url = metadata_dict.get('base', '')
             if not base_url:
                 base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-                if not base_url.endswith('/'):
-                    # 如果路径不以 / 结尾,则去掉最后一个路径段
-                    base_url = base_url.rsplit('/', 1)[0] + '/'
-
+
             if not author:
                 author = metadata_dict.get('author', '')
             if not publish_date:
                 publish_date = metadata_dict.get('publish_date', '')
-
+
         link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)
-
+
         if link_dict and links_parts:
             prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
             links_texts = []
@@ -154,6 +152,7 @@ async def main_process(_sites: set | list):
                 links_texts.extend(_parts.split('\n\n'))
             more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
             if more_url:
+                wiseflow_logger.debug(f'got {len(more_url)} more related urls, adding to working list')
                 working_list.update(more_url - existing_urls)

         if not contents:
@@ -173,10 +172,12 @@ async def main_process(_sites: set | list):
         prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
         infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
         if infos:
+            wiseflow_logger.debug(f'got {len(infos)} infos, saving to pb')
             await save_to_pb(url, title, infos)
     await crawler.close()

 if __name__ == '__main__':
+
     sites = pb.read('sites', filter='activated=True')
     wiseflow_logger.info('execute all sites one time')
-    asyncio.run(main_process([site['url'].rstrip('/') for site in sites]))
+    asyncio.run(main_process([site['url'] for site in sites]))
diff --git a/core/llms/openai_wrapper.py b/core/llms/openai_wrapper.py
index 75dd403..2eb4063 100644
--- a/core/llms/openai_wrapper.py
+++ b/core/llms/openai_wrapper.py
@@ -32,18 +32,9 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
         resp = response.choices[0].message.content
     except Exception as e:
         if logger:
-            logger.warning(f'{e}\nRetrying in 60 seconds...')
+            logger.warning(e)
         else:
-            print(f'{e}\nRetrying in 60 seconds...')
-            await asyncio.sleep(60)
-            response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
-            if response.status_code == 200 and response.choices:
-                resp = response.choices[0].message.content
-            else:
-                if logger:
-                    logger.error(f'after many try, llm 
error: {response}')
-                else:
-                    print(f'after many try, llm error: {response}')
+            print(e)
     finally:
         semaphore.release()
diff --git a/core/scrapers/mp_scraper.py b/core/scrapers/mp_scraper.py
index 0dfff67..649e69d 100644
--- a/core/scrapers/mp_scraper.py
+++ b/core/scrapers/mp_scraper.py
@@ -183,14 +183,16 @@ def process_content(content_div):
             if text:
                 content_parts.append(text)
             # 只在块级元素后添加换行符
-            if element.name in {'div', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
+            if element.name in {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                 content_parts.append('\n')
+            if element.name in {'div', 'section'}:
+                content_parts.append('# ')
         elif isinstance(element, str):
             text = element.strip()
             if text:
                 content_parts.append(text)

-    return ' '.join(content_parts).strip()
+    return ''.join(content_parts).strip()

     soup = BeautifulSoup(cleaned_html, 'html.parser')
diff --git a/core/tasks.py b/core/tasks.py
index f1338c7..573c882 100644
--- a/core/tasks.py
+++ b/core/tasks.py
@@ -15,7 +15,7 @@ async def schedule_pipeline(interval):
                 continue
             if counter % site['per_hours'] == 0:
                 wiseflow_logger.info(f"applying {site['url']}")
-                todo_urls.add(site['url'].rstrip('/'))
+                todo_urls.add(site['url'])
         counter += 1
         await main_process(todo_urls)
diff --git a/test/README.md b/test/README.md
index a6dcd26..975bbff 100644
--- a/test/README.md
+++ b/test/README.md
@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'

 ## html 内容解析

-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)

 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## 视觉大模型信息提取
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```

 ## 大模型信息提取测试
@@ -30,12 +22,12 @@ python get_visual_info_for_samples.py -F 'json_file_path'

 - 为测试任务创建 关注点说明,可以参考 [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json),

-- 要更改 get_info 的 prompt,请编辑 [prompts.py](./prompts.py)
-
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```

+*-I 是否需要同时测试 llm提取作者和发布时间*
+
 # 结果提交与共享

 wiseflow 是一个开源项目,希望通过大家共同的贡献,打造"人人可用的信息爬取工具"!
diff --git a/test/README_EN.md b/test/README_EN.md
index c6e9675..d8dcabf 100644
--- a/test/README_EN.md
+++ b/test/README_EN.md
@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...' 
 ## HTML Content Parsing

-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)

 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## Visual Large Model Information Extraction
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```

 ## Large Model Information Extraction Testing
@@ -30,12 +22,12 @@ python get_visual_info_for_samples.py -F 'json_file_path'

 - To create focus point descriptions for test tasks, refer to [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json)

-- To modify the prompt for get_info, edit [prompts.py](./prompts.py)
-
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```

+*-I whether to also test LLM extraction of the author and publish date*
+
 # Result Submission and Sharing

 Wiseflow is an open source project aiming to create an "information crawling tool for everyone" through collective contributions!
diff --git a/test/pre_process_test.py b/test/pre_process_test.py
index cb00473..bf969f8 100644
--- a/test/pre_process_test.py
+++ b/test/pre_process_test.py
@@ -89,8 +89,8 @@ async def main(html_sample, record_file):
     base_url = html_sample.get('base', '')
     if not base_url:
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-        if not base_url.endswith('/'):
-            base_url = base_url.rsplit('/', 1)[0] + '/'
+
+    print('base_url:', base_url)

     if not author:
         author = html_sample.get('author', '')
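
Note on the get_info.py / get_info_prompts.py changes: the prompts now instruct the model to keep citation markers such as "[3]" in its summaries precisely because the unchanged context lines in get_info resolve those markers back into URLs; a marker the model drops can never be resolved. The standalone sketch below restates that resolution step; the function name collect_references and the sample data are illustrative only, not part of the patch:

```python
import re

def collect_references(content: str, link_dict: dict[str, str]) -> dict[str, str]:
    # Find citation markers like "[3]" in the model's summary, then map each
    # marker back to its source URL via link_dict.
    url_tags = re.findall(r'\[\d+\]', content)
    return {tag: link_dict[tag] for tag in url_tags if tag in link_dict}

refs = collect_references(
    "Funding round announced[2]; hiring plans confirmed[5].",
    {"[2]": "https://example.com/funding", "[5]": "https://example.com/hiring"},
)
print(refs)  # {'[2]': 'https://example.com/funding', '[5]': 'https://example.com/hiring'}
```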
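
On the "base url bug" named in the subject: general_process.py and pre_process_test.py both stop trimming the last path segment when deriving base_url. One plausible failure mode of the removed code is shown below (a demonstration under assumed inputs, not a case cited by the patch), together with the reason the trimming was unnecessary for relative-link resolution:

```python
from urllib.parse import urljoin

# Old behavior: base_url = f"{scheme}://{netloc}{path}" yields
# "https://example.com" for a page with an empty path, and trimming the
# "last path segment" then leaves only the scheme.
url = "https://example.com"
old_base = url if url.endswith("/") else url.rsplit("/", 1)[0] + "/"
print(old_base)  # https:// -- the host is lost entirely

# New behavior: keep the path intact; urljoin already drops the final
# segment when resolving relative links, so no manual trimming is needed.
print(urljoin("https://example.com/blog/post.html", "img.png"))
# https://example.com/blog/img.png
```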
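
The openai_wrapper.py hunk replaces the in-wrapper sleep-and-retry (whose retry path also read a status_code attribute that chat-completion objects in the v1 openai SDK do not appear to expose) with a single logged attempt, leaving retry policy to the caller. A condensed, self-contained sketch of the resulting control flow; unlike the real module, which builds the AsyncOpenAI client and semaphore from environment config, both are passed in or hard-coded here:

```python
import asyncio

semaphore = asyncio.Semaphore(5)  # concurrency limit; config-driven in the real module

async def openai_llm(messages: list, model: str, client, logger=None, **kwargs) -> str:
    # One attempt only: on failure, log (or print) the exception and return
    # the empty string. `finally` guarantees the semaphore is released, so a
    # failed call can no longer hold a slot through a 60-second retry window.
    await semaphore.acquire()
    resp = ''
    try:
        response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
        resp = response.choices[0].message.content
    except Exception as e:
        if logger:
            logger.warning(e)
        else:
            print(e)
    finally:
        semaphore.release()
    return resp
```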
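
Finally, mp_scraper.py: joining content_parts with ' ' re-inserted stray spaces around every separator the code had just appended, so the join is now '' and only explicit separators survive. The '# ' emitted at <div>/<section> boundaries then lands at the start of the next line, so the fragment after a section boundary renders as a markdown heading. A small illustration with made-up fragments:

```python
# Fragments as process_content would collect them: paragraph text, a '\n'
# appended after a <p>, and '# ' appended at a <section> boundary.
content_parts = ['Intro paragraph', '\n', '# ', 'Section title', '\n']

old = ' '.join(content_parts)   # 'Intro paragraph \n #  Section title \n' -- stray spaces
new = ''.join(content_parts).strip()
print(new)
# Intro paragraph
# # Section title   <- the fragment after the boundary becomes an h1 heading
```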