11from pathlib import Path
22from types import SimpleNamespace
33
4+ import asyncio
45import numpy as np
56import pytest
67
@@ -18,6 +19,8 @@ def __init__(self) -> None:
1819 self .paragraphs : list [dict [str , object ]] = []
1920 self .entities : list [str ] = []
2021 self .relations : list [tuple [str , str , str ]] = []
22+ self .paragraph_backfills : list [tuple [str , str ]] = []
23+ self .relation_vector_states : list [tuple [str , str , str | None , bool ]] = []
2124
2225 def add_paragraph (self , ** kwargs ):
2326 self .paragraphs .append (dict (kwargs ))
@@ -33,8 +36,17 @@ def add_relation(self, *, subject: str, predicate: str, obj: str, **kwargs) -> s
3336 self .relations .append ((subject , predicate , obj ))
3437 return f"relation-{ len (self .relations )} "
3538
36- def set_relation_vector_state (self , rel_hash : str , state : str ) -> None :
37- del rel_hash , state
39+ def set_relation_vector_state (
40+ self ,
41+ rel_hash : str ,
42+ state : str ,
43+ error : str | None = None ,
44+ bump_retry : bool = False ,
45+ ) -> None :
46+ self .relation_vector_states .append ((rel_hash , state , error , bump_retry ))
47+
48+ def enqueue_paragraph_vector_backfill (self , paragraph_hash : str , * , error : str = "" ) -> None :
49+ self .paragraph_backfills .append ((paragraph_hash , error ))
3850
3951 def get_live_paragraphs_by_source (self , source : str ):
4052 return [
@@ -58,32 +70,60 @@ def add_edges(self, edges, relation_hashes=None):
5870
5971
6072class _DummyVectorStore :
73+ def __init__ (self ) -> None :
74+ self .ids : list [str ] = []
75+
6176 def __contains__ (self , item : str ) -> bool :
62- del item
63- return False
77+ return item in self .ids
6478
6579 def add (self , vectors , ids ):
66- del vectors , ids
80+ del vectors
81+ self .ids .extend (list (ids ))
6782
6883
6984class _DummyEmbeddingManager :
85+ def __init__ (self , * , delay : float = 0.0 , fail_for : str = "" ) -> None :
86+ self .delay = delay
87+ self .fail_for = fail_for
88+ self .inflight = 0
89+ self .max_inflight = 0
90+ self .calls : list [str ] = []
91+
7092 async def encode (self , text : str ) -> np .ndarray :
71- del text
93+ self .calls .append (text )
94+ self .inflight += 1
95+ self .max_inflight = max (self .max_inflight , self .inflight )
96+ try :
97+ if self .delay :
98+ await asyncio .sleep (self .delay )
99+ if self .fail_for and self .fail_for in text :
100+ raise RuntimeError ("embedding failed" )
101+ finally :
102+ self .inflight -= 1
72103 return np .ones (4 , dtype = np .float32 )
73104
74105
75- def _build_manager () -> tuple [ImportTaskManager , _DummyMetadataStore ]:
106+ def _build_manager (
107+ * ,
108+ embedding_manager : _DummyEmbeddingManager | None = None ,
109+ relation_vectorization_enabled : bool = False ,
110+ ) -> tuple [ImportTaskManager , _DummyMetadataStore ]:
76111 metadata_store = _DummyMetadataStore ()
112+ config = {
113+ "retrieval.relation_vectorization" : {
114+ "enabled" : relation_vectorization_enabled ,
115+ "write_on_import" : relation_vectorization_enabled ,
116+ }
117+ }
77118 plugin = SimpleNamespace (
78119 metadata_store = metadata_store ,
79120 graph_store = _DummyGraphStore (),
80121 vector_store = _DummyVectorStore (),
81- embedding_manager = _DummyEmbeddingManager (),
122+ embedding_manager = embedding_manager or _DummyEmbeddingManager (),
82123 relation_write_service = None ,
83- get_config = lambda key , default = None : default ,
124+ get_config = lambda key , default = None : config . get ( key , default ) ,
84125 _is_embedding_degraded = lambda : False ,
85126 _allow_metadata_only_write = lambda : True ,
86- write_paragraph_vector_or_enqueue = None ,
87127 )
88128 manager = ImportTaskManager (plugin )
89129 return manager , metadata_store
@@ -260,3 +300,100 @@ async def test_persist_processed_chunk_skips_invalid_nested_items() -> None:
260300 assert len (metadata_store .paragraphs ) == 1
261301 assert set (metadata_store .entities ) >= {"Alice" , "地图" }
262302 assert metadata_store .relations == [("Alice" , "持有" , "地图" )]
303+
304+
305+ @pytest .mark .asyncio
306+ async def test_persist_processed_chunk_does_not_hold_storage_lock_during_embedding () -> None :
307+ embedding_manager = _DummyEmbeddingManager (delay = 0.05 )
308+ manager , metadata_store = _build_manager (embedding_manager = embedding_manager )
309+ file_record = SimpleNamespace (source_path = "" , source_kind = "paste" , name = "demo.txt" )
310+
311+ await asyncio .gather (
312+ manager ._persist_processed_chunk (
313+ file_record ,
314+ ProcessedChunk (
315+ type = KnowledgeType .FACTUAL ,
316+ source = SourceInfo (file = "demo.txt" , offset_start = 0 , offset_end = 4 ),
317+ chunk = ChunkContext (chunk_id = "chunk-1" , index = 0 , text = "第一段事实" ),
318+ data = {},
319+ ),
320+ ),
321+ manager ._persist_processed_chunk (
322+ file_record ,
323+ ProcessedChunk (
324+ type = KnowledgeType .FACTUAL ,
325+ source = SourceInfo (file = "demo.txt" , offset_start = 5 , offset_end = 9 ),
326+ chunk = ChunkContext (chunk_id = "chunk-2" , index = 1 , text = "第二段事实" ),
327+ data = {},
328+ ),
329+ ),
330+ )
331+
332+ assert len (metadata_store .paragraphs ) == 2
333+ assert embedding_manager .max_inflight == 2
334+
335+
336+ @pytest .mark .asyncio
337+ async def test_relation_vector_failure_keeps_metadata_and_marks_failed () -> None :
338+ manager , metadata_store = _build_manager (
339+ embedding_manager = _DummyEmbeddingManager (fail_for = "关系是持有" ),
340+ relation_vectorization_enabled = True ,
341+ )
342+
343+ relation_hash = await manager ._add_relation ("Alice" , "持有" , "地图" , source_paragraph = "paragraph-1" )
344+
345+ assert relation_hash == "relation-1"
346+ assert metadata_store .relations == [("Alice" , "持有" , "地图" )]
347+ assert ("relation-1" , "pending" , None , False ) in metadata_store .relation_vector_states
348+ assert metadata_store .relation_vector_states [- 1 ] == ("relation-1" , "failed" , "embedding failed" , True )
349+
350+
351+ @pytest .mark .asyncio
352+ async def test_high_concurrency_persist_processed_chunks_keep_all_writes_consistent () -> None :
353+ chunk_count = 60
354+ relations_per_chunk = 2
355+ entities_per_chunk = 5
356+ embedding_manager = _DummyEmbeddingManager (delay = 0.001 )
357+ manager , metadata_store = _build_manager (
358+ embedding_manager = embedding_manager ,
359+ relation_vectorization_enabled = True ,
360+ )
361+ file_record = SimpleNamespace (source_path = "" , source_kind = "paste" , name = "stress.txt" )
362+
363+ async def persist (index : int ) -> None :
364+ await manager ._persist_processed_chunk (
365+ file_record ,
366+ ProcessedChunk (
367+ type = KnowledgeType .FACTUAL ,
368+ source = SourceInfo (file = "stress.txt" , offset_start = index * 10 , offset_end = index * 10 + 9 ),
369+ chunk = ChunkContext (chunk_id = f"chunk-{ index } " , index = index , text = f"第 { index } 段高并发事实" ),
370+ data = {
371+ "triples" : [
372+ {"subject" : f"subject-{ index } -a" , "predicate" : "关联" , "object" : f"object-{ index } -a" },
373+ ],
374+ "relations" : [
375+ {"subject" : f"subject-{ index } -b" , "predicate" : "包含" , "object" : f"object-{ index } -b" },
376+ ],
377+ "entities" : [f"marker-{ index } " ],
378+ },
379+ ),
380+ )
381+
382+ await asyncio .wait_for (
383+ asyncio .gather (* (persist (index ) for index in range (chunk_count ))),
384+ timeout = 15 ,
385+ )
386+
387+ vector_ids = set (manager .plugin .vector_store .ids )
388+ ready_states = [state for _ , state , _ , _ in metadata_store .relation_vector_states if state == "ready" ]
389+ failed_states = [state for _ , state , _ , _ in metadata_store .relation_vector_states if state == "failed" ]
390+
391+ assert len (metadata_store .paragraphs ) == chunk_count
392+ assert len (metadata_store .relations ) == chunk_count * relations_per_chunk
393+ assert len (manager .plugin .graph_store .edges ) == chunk_count * relations_per_chunk
394+ assert len ({paragraph ["source" ] for paragraph in metadata_store .paragraphs }) == 1
395+ assert len (vector_ids ) == chunk_count * (1 + entities_per_chunk + relations_per_chunk )
396+ assert len (ready_states ) == chunk_count * relations_per_chunk
397+ assert failed_states == []
398+ assert metadata_store .paragraph_backfills == []
399+ assert embedding_manager .max_inflight > 1
0 commit comments