From f0e936ae4372cb829227072b2eb6f64deadefe95 Mon Sep 17 00:00:00 2001
From: Sami Rusani <sr@samirusani>
Date: Sun, 26 Apr 2026 23:01:02 +0200
Subject: [PATCH] fix: sample long text for ontology generation

---
 backend/app/services/ontology_generator.py | 151 +++++++++++++++++++--
 backend/tests/test_ontology_generator.py   |  52 +++++++
 2 files changed, 194 insertions(+), 9 deletions(-)
 create mode 100644 backend/tests/test_ontology_generator.py

diff --git a/backend/app/services/ontology_generator.py b/backend/app/services/ontology_generator.py
index 01a3d799a5..7619a79f6d 100644
--- a/backend/app/services/ontology_generator.py
+++ b/backend/app/services/ontology_generator.py
@@ -9,6 +9,7 @@
 from typing import Dict, Any, List, Optional
 from ..utils.llm_client import LLMClient
 from ..utils.locale import get_language_instruction
+from ..utils.file_parser import split_text_into_chunks
 
 logger = logging.getLogger(__name__)
 
@@ -227,6 +228,10 @@ def generate(
     
     # 传给 LLM 的文本最大长度（5万字）
     MAX_TEXT_LENGTH_FOR_LLM = 50000
+    LONG_TEXT_CHUNK_SIZE = 8000  # chunk_size handed to split_text_into_chunks for over-length input
+    LONG_TEXT_CHUNK_OVERLAP = 200  # overlap handed to split_text_into_chunks
+    MAX_LONG_TEXT_CHUNKS = 60  # upper bound on how many chunks are sampled into the prompt
+    MIN_LONG_TEXT_EXCERPT = 400  # floor for the per-chunk excerpt budget (characters)
     
     def _build_user_message(
         self,
@@ -236,14 +241,7 @@ def _build_user_message(
     ) -> str:
         """构建用户消息"""
         
-        # 合并文本
-        combined_text = "\n\n---\n\n".join(document_texts)
-        original_length = len(combined_text)
-        
-        # 如果文本超过5万字，截断（仅影响传给LLM的内容，不影响图谱构建）
-        if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM:
-            combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
-            combined_text += f"\n\n...(原文共{original_length}字，已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..."
+        combined_text = self._build_document_context(document_texts)
         
         message = f"""## 模拟需求
 
@@ -273,6 +271,142 @@ def _build_user_message(
 """
         
         return message
+
+    def _build_document_context(self, document_texts: List[str]) -> str:
+        """Build the document context for ontology analysis; long input is sampled chunk-by-chunk across the whole text instead of keeping only its head."""
+
+        combined_text = "\n\n---\n\n".join(document_texts)
+        original_length = len(combined_text)
+
+        if original_length <= self.MAX_TEXT_LENGTH_FOR_LLM:
+            return combined_text  # within the limit: the joined text is passed through unchanged
+
+        chunks = self._collect_document_chunks(document_texts)
+        if not chunks:
+            return ""  # over the limit, yet the splitter produced no chunks
+
+        selected_chunks = self._select_representative_chunks(chunks)
+        excerpt_budget = self._calculate_excerpt_budget(len(selected_chunks))
+        context = self._render_chunked_context(
+            selected_chunks=selected_chunks,
+            original_length=original_length,
+            total_chunks=len(chunks),
+            excerpt_limit=excerpt_budget,
+        )
+
+        while len(context) > self.MAX_TEXT_LENGTH_FOR_LLM and excerpt_budget > self.MIN_LONG_TEXT_EXCERPT:
+            excerpt_budget = max(self.MIN_LONG_TEXT_EXCERPT, int(excerpt_budget * 0.85))  # shrink 15% per pass, floored at the minimum
+            context = self._render_chunked_context(
+                selected_chunks=selected_chunks,
+                original_length=original_length,
+                total_chunks=len(chunks),
+                excerpt_limit=excerpt_budget,
+            )
+
+        if len(context) > self.MAX_TEXT_LENGTH_FOR_LLM:  # still too long at the minimum budget: hard cut
+            marker = "\n\n...(分块上下文已压缩到本体分析长度限制内)..."
+            context = context[:self.MAX_TEXT_LENGTH_FOR_LLM - len(marker)] + marker  # leave room for the marker
+
+        return context
+
+    def _collect_document_chunks(self, document_texts: List[str]) -> List[Dict[str, Any]]:
+        """Collect chunks document by document, keeping 1-based document and chunk numbers so the prompt can point at them."""
+
+        all_chunks: List[Dict[str, Any]] = []
+        for doc_index, text in enumerate(document_texts, 1):
+            doc_chunks = split_text_into_chunks(
+                text,
+                chunk_size=self.LONG_TEXT_CHUNK_SIZE,
+                overlap=self.LONG_TEXT_CHUNK_OVERLAP,
+            )
+            total_doc_chunks = len(doc_chunks)  # per-document total, rendered as "chunk i/total"
+            for chunk_index, chunk in enumerate(doc_chunks, 1):
+                all_chunks.append({
+                    "document_index": doc_index,
+                    "chunk_index": chunk_index,
+                    "total_document_chunks": total_doc_chunks,
+                    "text": chunk,
+                })
+
+        return all_chunks
+
+    def _select_representative_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Sample chunks at evenly spaced indexes so the beginning, middle and end of a long text are all covered."""
+
+        if len(chunks) <= self.MAX_LONG_TEXT_CHUNKS:
+            return chunks  # already within the cap: keep every chunk
+
+        if self.MAX_LONG_TEXT_CHUNKS <= 1:  # guards the division by (MAX_LONG_TEXT_CHUNKS - 1) below
+            return [chunks[0]]
+
+        last_index = len(chunks) - 1
+        selected_indexes = {  # a set, so positions that round to the same index collapse
+            round(i * last_index / (self.MAX_LONG_TEXT_CHUNKS - 1))
+            for i in range(self.MAX_LONG_TEXT_CHUNKS)
+        }
+        return [chunks[i] for i in sorted(selected_indexes)]  # restore document order
+
+    def _calculate_excerpt_budget(self, selected_count: int) -> int:
+        """Return the per-chunk character budget for the given number of selected chunks."""
+
+        header_budget = 600  # presumably an allowance for the intro lines of the rendered context; confirm
+        chunk_header_budget = 120 * selected_count  # presumably an allowance for each chunk's header line; confirm
+        available = max(
+            self.MIN_LONG_TEXT_EXCERPT * selected_count,
+            self.MAX_TEXT_LENGTH_FOR_LLM - header_budget - chunk_header_budget,
+        )
+        return max(self.MIN_LONG_TEXT_EXCERPT, available // max(selected_count, 1))  # max(..., 1) avoids dividing by zero
+
+    def _render_chunked_context(
+        self,
+        selected_chunks: List[Dict[str, Any]],
+        original_length: int,
+        total_chunks: int,
+        excerpt_limit: int,
+    ) -> str:
+        """Render the sampled chunks of a long text as two intro lines followed by one headed excerpt per chunk."""
+
+        lines = [
+            (
+                f"【长文本自动分块摘要】原文共{original_length}字，"
+                f"已分为{total_chunks}个文本块用于全局覆盖分析。"
+            ),
+            (
+                f"以下展示其中{len(selected_chunks)}个代表性文本块的摘录，"
+                "覆盖开头、中段和结尾；请基于这些跨全文线索设计本体，不要只依赖第一段内容。"
+            ),
+        ]
+
+        for chunk in selected_chunks:
+            excerpt = self._excerpt_text(chunk["text"], excerpt_limit)  # each chunk is cut to at most excerpt_limit characters
+            lines.append(
+                "\n".join([
+                    (
+                        f"--- 文档 {chunk['document_index']} / "
+                        f"分块 {chunk['chunk_index']}/{chunk['total_document_chunks']} ---"
+                    ),
+                    excerpt,
+                ])
+            )
+
+        return "\n\n".join(lines)  # blank line between the intro lines and every chunk section
+
+    @staticmethod
+    def _excerpt_text(text: str, char_limit: int) -> str:
+        """Keep the head and tail of a long chunk so that a chunk is not itself reduced to only its beginning."""
+
+        text = text.strip()
+        if len(text) <= char_limit:
+            return text  # fits as-is after stripping
+
+        marker = "\n...(本分块中间内容省略)...\n"
+        if char_limit <= len(marker) + 20:  # too little room for the marker plus text on both sides
+            return text[:char_limit]
+
+        remaining = char_limit - len(marker)
+        head_len = remaining // 2
+        tail_len = remaining - head_len  # the tail takes the extra character when remaining is odd
+        return f"{text[:head_len].rstrip()}{marker}{text[-tail_len:].lstrip()}"
     
     def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
         """验证和后处理结果"""
@@ -503,4 +637,3 @@ def generate_python_code(self, ontology: Dict[str, Any]) -> str:
         code_lines.append('}')
         
         return '\n'.join(code_lines)
-
diff --git a/backend/tests/test_ontology_generator.py b/backend/tests/test_ontology_generator.py
new file mode 100644
index 0000000000..4d81d2ff8c
--- /dev/null
+++ b/backend/tests/test_ontology_generator.py
@@ -0,0 +1,52 @@
+from app.services.ontology_generator import OntologyGenerator
+
+
+def _generator_for_test() -> OntologyGenerator:  # helper: a generator with scaled-down long-text limits
+    generator = OntologyGenerator(llm_client=object())  # placeholder client object
+    generator.MAX_TEXT_LENGTH_FOR_LLM = 2000  # instance-level overrides so short strings hit the long-text path
+    generator.LONG_TEXT_CHUNK_SIZE = 500
+    generator.LONG_TEXT_CHUNK_OVERLAP = 0
+    generator.MAX_LONG_TEXT_CHUNKS = 3
+    generator.MIN_LONG_TEXT_EXCERPT = 120
+    return generator
+
+
+def test_short_ontology_context_keeps_original_text():  # input under the limit must come back unchanged
+    generator = _generator_for_test()
+
+    context = generator._build_document_context(["short document body"])
+
+    assert context == "short document body"
+    assert "长文本自动分块摘要" not in context  # the chunk-summary header must not be added
+
+
+def test_long_ontology_context_samples_across_document():  # over-limit input must be sampled from start, middle and end
+    generator = _generator_for_test()
+    long_text = "BEGIN" + ("a" * 1050) + "MIDDLE" + ("b" * 1050) + "END"  # 2114 characters in total
+
+    context = generator._build_document_context([long_text])
+
+    assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM
+    assert "长文本自动分块摘要" in context
+    assert "BEGIN" in context
+    assert "MIDDLE" in context
+    assert "END" in context
+    assert "分块 1/" in context  # expects chunks 1, 3 and 5 to be the ones sampled
+    assert "分块 3/" in context
+    assert "分块 5/" in context
+
+
+def test_very_long_ontology_context_selects_representative_chunks():  # the number of sampled chunks is capped
+    generator = _generator_for_test()
+    chunks = ["BEGIN"] + [
+        f"CHUNK{i:02d}-" + (str(i) * 490)  # str(i) is two characters for i >= 10, so those pieces are longer
+        for i in range(12)
+    ] + ["FINALEND"]
+    long_text = "".join(chunks)
+
+    context = generator._build_document_context([long_text])
+
+    assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM
+    assert "BEGIN" in context
+    assert "FINALEND" in context
+    assert context.count("--- 文档 1 / 分块") == generator.MAX_LONG_TEXT_CHUNKS  # one chunk header per sampled chunk