From f0e936ae4372cb829227072b2eb6f64deadefe95 Mon Sep 17 00:00:00 2001 From: Sami Rusani Date: Sun, 26 Apr 2026 23:01:02 +0200 Subject: [PATCH] fix: sample long text for ontology generation --- backend/app/services/ontology_generator.py | 151 +++++++++++++++++++-- backend/tests/test_ontology_generator.py | 52 +++++++ 2 files changed, 194 insertions(+), 9 deletions(-) create mode 100644 backend/tests/test_ontology_generator.py diff --git a/backend/app/services/ontology_generator.py b/backend/app/services/ontology_generator.py index 01a3d799a5..7619a79f6d 100644 --- a/backend/app/services/ontology_generator.py +++ b/backend/app/services/ontology_generator.py @@ -9,6 +9,7 @@ from typing import Dict, Any, List, Optional from ..utils.llm_client import LLMClient from ..utils.locale import get_language_instruction +from ..utils.file_parser import split_text_into_chunks logger = logging.getLogger(__name__) @@ -227,6 +228,10 @@ def generate( # 传给 LLM 的文本最大长度(5万字) MAX_TEXT_LENGTH_FOR_LLM = 50000 + LONG_TEXT_CHUNK_SIZE = 8000 + LONG_TEXT_CHUNK_OVERLAP = 200 + MAX_LONG_TEXT_CHUNKS = 60 + MIN_LONG_TEXT_EXCERPT = 400 def _build_user_message( self, @@ -236,14 +241,7 @@ def _build_user_message( ) -> str: """构建用户消息""" - # 合并文本 - combined_text = "\n\n---\n\n".join(document_texts) - original_length = len(combined_text) - - # 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建) - if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM: - combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM] - combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..." + combined_text = self._build_document_context(document_texts) message = f"""## 模拟需求 @@ -273,6 +271,142 @@ def _build_user_message( """ return message + + def _build_document_context(self, document_texts: List[str]) -> str: + """构建用于本体分析的文档上下文,长文本按全局分块抽样而不是只截取开头。""" + + combined_text = "\n\n---\n\n".join(document_texts) + original_length = len(combined_text) + + if original_length <= self.MAX_TEXT_LENGTH_FOR_LLM: + return combined_text + + chunks = self._collect_document_chunks(document_texts) + if not chunks: + return "" + + selected_chunks = self._select_representative_chunks(chunks) + excerpt_budget = self._calculate_excerpt_budget(len(selected_chunks)) + context = self._render_chunked_context( + selected_chunks=selected_chunks, + original_length=original_length, + total_chunks=len(chunks), + excerpt_limit=excerpt_budget, + ) + + while len(context) > self.MAX_TEXT_LENGTH_FOR_LLM and excerpt_budget > self.MIN_LONG_TEXT_EXCERPT: + excerpt_budget = max(self.MIN_LONG_TEXT_EXCERPT, int(excerpt_budget * 0.85)) + context = self._render_chunked_context( + selected_chunks=selected_chunks, + original_length=original_length, + total_chunks=len(chunks), + excerpt_limit=excerpt_budget, + ) + + if len(context) > self.MAX_TEXT_LENGTH_FOR_LLM: + marker = "\n\n...(分块上下文已压缩到本体分析长度限制内)..." + context = context[:self.MAX_TEXT_LENGTH_FOR_LLM - len(marker)] + marker + + return context + + def _collect_document_chunks(self, document_texts: List[str]) -> List[Dict[str, Any]]: + """按文档收集分块,保留文档和分块编号方便提示词定位。""" + + all_chunks: List[Dict[str, Any]] = [] + for doc_index, text in enumerate(document_texts, 1): + doc_chunks = split_text_into_chunks( + text, + chunk_size=self.LONG_TEXT_CHUNK_SIZE, + overlap=self.LONG_TEXT_CHUNK_OVERLAP, + ) + total_doc_chunks = len(doc_chunks) + for chunk_index, chunk in enumerate(doc_chunks, 1): + all_chunks.append({ + "document_index": doc_index, + "chunk_index": chunk_index, + "total_document_chunks": total_doc_chunks, + "text": chunk, + }) + + return all_chunks + + def _select_representative_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """从全部分块中等距抽样,覆盖长文开头、中段和结尾。""" + + if len(chunks) <= self.MAX_LONG_TEXT_CHUNKS: + return chunks + + if self.MAX_LONG_TEXT_CHUNKS <= 1: + return [chunks[0]] + + last_index = len(chunks) - 1 + selected_indexes = { + round(i * last_index / (self.MAX_LONG_TEXT_CHUNKS - 1)) + for i in range(self.MAX_LONG_TEXT_CHUNKS) + } + return [chunks[i] for i in sorted(selected_indexes)] + + def _calculate_excerpt_budget(self, selected_count: int) -> int: + """根据选中的分块数量为每块分配字符预算。""" + + header_budget = 600 + chunk_header_budget = 120 * selected_count + available = max( + self.MIN_LONG_TEXT_EXCERPT * selected_count, + self.MAX_TEXT_LENGTH_FOR_LLM - header_budget - chunk_header_budget, + ) + return max(self.MIN_LONG_TEXT_EXCERPT, available // max(selected_count, 1)) + + def _render_chunked_context( + self, + selected_chunks: List[Dict[str, Any]], + original_length: int, + total_chunks: int, + excerpt_limit: int, + ) -> str: + """渲染长文本分块上下文。""" + + lines = [ + ( + f"【长文本自动分块摘要】原文共{original_length}字," + f"已分为{total_chunks}个文本块用于全局覆盖分析。" + ), + ( + f"以下展示其中{len(selected_chunks)}个代表性文本块的摘录," + "覆盖开头、中段和结尾;请基于这些跨全文线索设计本体,不要只依赖第一段内容。" + ), + ] + + for chunk in selected_chunks: + excerpt = self._excerpt_text(chunk["text"], excerpt_limit) + lines.append( + "\n".join([ + ( + f"--- 文档 {chunk['document_index']} / " + f"分块 {chunk['chunk_index']}/{chunk['total_document_chunks']} ---" + ), + excerpt, + ]) + ) + + return "\n\n".join(lines) + + @staticmethod + def _excerpt_text(text: str, char_limit: int) -> str: + """长分块保留首尾,避免每个分块内部再次变成只看开头。""" + + text = text.strip() + if len(text) <= char_limit: + return text + + marker = "\n...(本分块中间内容省略)...\n" + if char_limit <= len(marker) + 20: + return text[:char_limit] + + remaining = char_limit - len(marker) + head_len = remaining // 2 + tail_len = remaining - head_len + return f"{text[:head_len].rstrip()}{marker}{text[-tail_len:].lstrip()}" def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: """验证和后处理结果""" @@ -503,4 +637,3 @@ def generate_python_code(self, ontology: Dict[str, Any]) -> str: code_lines.append('}') return '\n'.join(code_lines) - diff --git a/backend/tests/test_ontology_generator.py b/backend/tests/test_ontology_generator.py new file mode 100644 index 0000000000..4d81d2ff8c --- /dev/null +++ b/backend/tests/test_ontology_generator.py @@ -0,0 +1,52 @@ +from app.services.ontology_generator import OntologyGenerator + + +def _generator_for_test() -> OntologyGenerator: + generator = OntologyGenerator(llm_client=object()) + generator.MAX_TEXT_LENGTH_FOR_LLM = 2000 + generator.LONG_TEXT_CHUNK_SIZE = 500 + generator.LONG_TEXT_CHUNK_OVERLAP = 0 + generator.MAX_LONG_TEXT_CHUNKS = 3 + generator.MIN_LONG_TEXT_EXCERPT = 120 + return generator + + +def test_short_ontology_context_keeps_original_text(): + generator = _generator_for_test() + + context = generator._build_document_context(["short document body"]) + + assert context == "short document body" + assert "长文本自动分块摘要" not in context + + +def test_long_ontology_context_samples_across_document(): + generator = _generator_for_test() + long_text = "BEGIN" + ("a" * 1050) + "MIDDLE" + ("b" * 1050) + "END" + + context = generator._build_document_context([long_text]) + + assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM + assert "长文本自动分块摘要" in context + assert "BEGIN" in context + assert "MIDDLE" in context + assert "END" in context + assert "分块 1/" in context + assert "分块 3/" in context + assert "分块 5/" in context + + +def test_very_long_ontology_context_selects_representative_chunks(): + generator = _generator_for_test() + chunks = ["BEGIN"] + [ + f"CHUNK{i:02d}-" + (str(i) * 490) + for i in range(12) + ] + ["FINALEND"] + long_text = "".join(chunks) + + context = generator._build_document_context([long_text]) + + assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM + assert "BEGIN" in context + assert "FINALEND" in context + assert context.count("--- 文档 1 / 分块") == generator.MAX_LONG_TEXT_CHUNKS