33
44import os
55import logging
6- import tempfile
76from pathlib import Path
87from typing import Any , Dict , List , Optional , Tuple
98
10- import numpy as np
11- import imageio .v3 as iio
12-
139import re
1410from utils .tags import strip_tags , parse_exclusions , has_no_rerank , has_refine
1511
2420from generator .schema import CandidateDoc , NoToolReason
2521from utils .file_validator import FileValidator
2622
27- from utils .image_meta import summarize_image_metadata , detect_ext_token
28- from utils .image_io import load_any
29- from utils .previews import mip_montage , slice_gif , stack_sweep_gif , contact_sheet_slices
23+ from utils .image_meta import detect_ext_token
24+ from utils .previews import _build_preview_for_vlm , _cleanup_old_previews
25+ from utils .utils import _best_runnable_link
3026
3127log = logging .getLogger ("pipeline" )
3228
@@ -35,7 +31,6 @@ class RAGImagingPipeline:
3531 def __init__ (
3632 self ,
3733 docs : List [SoftwareDoc ],
38- hf_token : Optional [str ] = None ,
3934 index_dir : Optional [str ] = None ,
4035 ):
4136 self .index_dir = Path (index_dir or os .getenv ("RAG_INDEX_DIR" , "artifacts/rag_index" ))
@@ -44,10 +39,9 @@ def __init__(
4439 self .embedder = LocalBGEEmbedder ()
4540 self .reranker = CrossEncoderReranker ()
4641 self .selector_vlm = VLMToolSelector ()
47- self .hf_token = hf_token
4842
4943 try :
50- self . _cleanup_old_previews (hours = 24 )
44+ _cleanup_old_previews (hours = 24 )
5145 except Exception :
5246 logging .getLogger ("api" ).exception ("Preview cleanup at init failed; continuing" )
5347
@@ -183,90 +177,6 @@ def _norm(s: str) -> str:
183177 return hits , {"top" : top , "second" : second , "margin" : margin }
184178
185179
def _build_preview_for_vlm(self, image_paths: Optional[List[str]]) -> Tuple[Optional[str], Optional[str]]:
    """Render one preview file for the first image that can be previewed.

    Args:
        image_paths: Paths of the candidate images; may be ``None`` or empty.

    Returns:
        ``(preview_path, meta_text)``. ``preview_path`` is the path of a PNG
        or GIF written into a fresh ``preview_*`` temp directory, or ``None``
        when no image could be rendered. ``meta_text`` is whatever
        ``summarize_image_metadata`` returned for all paths, or ``None`` if
        that call failed or there were no paths.

    The method never raises for a bad image: a failing image is skipped and
    the next one is tried.
    """
    import shutil

    if not image_paths:
        return None, None

    meta_text = None
    try:
        meta_text = summarize_image_metadata(image_paths)
    except Exception:
        log.exception("Image metadata summarization failed; continuing without metadata.")

    for p in image_paths:
        tmpdir = None
        try:
            data, meta = load_any(p)
            # Prefer the shape reported by the loader's metadata, then fall
            # back to the array's own shape.
            shp = getattr(meta, "shape", None) or meta.get("shape")
            if shp is None:
                shp = getattr(data, "shape", None)
            if shp is None:
                continue

            tmpdir = Path(tempfile.mkdtemp(prefix="preview_"))
            ndim = len(shp)

            if ndim == 3:
                png_path = tmpdir / "slices_grid.png"
                gif_path = tmpdir / "sweep.gif"
                # Still image: contact sheet first, MIP montage as fallback.
                try:
                    contact_sheet_slices(data, png_path, max_slices=36, grid_cols=6)
                except Exception:
                    try:
                        mip_montage(data, png_path)
                    except Exception:
                        pass
                # The animated sweep is attempted regardless; the PNG wins
                # if both files were produced.
                try:
                    stack_sweep_gif(data, gif_path, fps=12, max_frames=64)
                except Exception:
                    pass
                if png_path.exists():
                    return str(png_path), meta_text
                if gif_path.exists():
                    return str(gif_path), meta_text

            elif ndim == 4:
                # Collapse the last axis by averaging, then sweep along
                # axis 2 using at most ~64 frames.
                vol = np.asarray(data).mean(axis=-1)
                out = tmpdir / "sweep.gif"
                step = max(1, vol.shape[2] // 64)
                slice_gif(vol, out, axis=2, step=step, fps=12)
                return str(out), meta_text

            elif ndim == 2:
                out = tmpdir / "image.png"
                arr = data
                if arr.dtype != np.uint8:
                    # NOTE(review): assumes non-uint8 data is already scaled
                    # to [0, 1] - confirm against load_any.
                    arr = (np.clip(arr, 0, 1) * 255).astype(np.uint8)
                iio.imwrite(str(out), arr)
                return str(out), meta_text
        except Exception:
            # Best-effort: keep going, but leave a trace for debugging.
            log.debug("Preview generation failed for %s; trying next image.", p, exc_info=True)

        # Reaching this point means no preview was returned for this image;
        # remove the temp directory instead of leaking it.
        if tmpdir is not None:
            shutil.rmtree(tmpdir, ignore_errors=True)

    return None, meta_text
244-
def _cleanup_old_previews(self, hours: int = 24) -> None:
    """Delete ``preview_*`` folders older than ``hours`` from the system temp dir.

    Args:
        hours: Age threshold; a folder is removed when its modification time
            is more than this many hours in the past.

    Best-effort: entries that cannot be inspected or removed are skipped, and
    an unexpected failure of the scan itself is logged, never raised.
    """
    import shutil
    import tempfile
    import time

    root = Path(tempfile.gettempdir())
    cutoff = time.time() - hours * 3600
    try:
        for p in root.glob("preview_*"):
            try:
                # Skip symlinks so the cleanup can never delete content that
                # lives outside the temp directory.
                if p.is_symlink() or not p.is_dir():
                    continue
                if p.stat().st_mtime < cutoff:
                    # rmtree removes nested subdirectories as well; unlinking
                    # files and calling rmdir() left such folders behind.
                    shutil.rmtree(p, ignore_errors=True)
            except OSError:
                # Entry vanished or is not accessible; try the next one.
                continue
    except Exception:
        logging.getLogger("api").exception("Preview cleanup failed")
268-
269-
270180 def _select (self , hits , image_meta_text , user_task , preview_path ):
271181 num_choices = int (os .getenv ("NUM_CHOICES" , "3" ))
272182
@@ -352,38 +262,6 @@ def _select(self, hits, image_meta_text, user_task, preview_path):
352262 sel_json ["choices" ] = sel_json .get ("choices" , [])[:num_choices ]
353263 return sel_json
354264
def _best_runnable_link(self, doc: "SoftwareDoc") -> Optional[str]:
    """Return the first usable demo URL declared on ``doc``, or ``None``.

    The ``runnable_example`` attribute is searched first, then
    ``has_executable_notebook``. Within one attribute, dict entries carrying
    a numeric ``priority`` are tried in ascending priority order; all other
    entries keep their original relative order after them.

    Each attribute may hold a list of entries or a single entry (a URL string
    or a dict). Values that are neither (for example a bare ``True``) are
    ignored rather than raising.

    Args:
        doc: Object whose attributes are read with ``getattr``; missing
            attributes are treated as empty.

    Returns:
        The stripped URL string, or ``None`` when no entry yields one.
    """
    def priority(item) -> float:
        # Entries without a usable priority sort last.
        if isinstance(item, dict) and "priority" in item:
            try:
                return float(item["priority"])
            except (TypeError, ValueError, OverflowError):
                pass
        return 1e9

    def extract_url(item) -> Optional[str]:
        if isinstance(item, str):
            u = item.strip()
            return u or None
        if isinstance(item, dict):
            for k in ("url", "href", "link", "contentUrl"):
                u = item.get(k)
                if isinstance(u, str) and u.strip():
                    return u.strip()
        return None

    def as_items(value) -> list:
        # A lone string or dict is one entry, not a sequence of characters
        # or keys; non-iterable values contribute nothing.
        if not value:
            return []
        if isinstance(value, (str, dict)):
            return [value]
        try:
            return list(value)
        except TypeError:
            return []

    for field in ("runnable_example", "has_executable_notebook"):
        items = as_items(getattr(doc, field, None))
        # sorted() is stable, so equal-priority entries keep their order.
        for it in sorted(items, key=priority):
            url = extract_url(it)
            if url:
                return url

    return None
386-
387265 def recommend_and_link (
388266 self ,
389267 image_paths : Optional [List [str ]],
@@ -423,7 +301,7 @@ def _norm(s: str) -> str:
423301 preview_path = None
424302 image_meta_text = ""
425303 try :
426- preview_path , image_meta_text = self . _build_preview_for_vlm (image_paths or [])
304+ preview_path , image_meta_text = _build_preview_for_vlm (image_paths or [])
427305 except Exception :
428306 image_meta_text = ""
429307
@@ -548,7 +426,7 @@ def _fallback_score(i: int, hit: dict) -> float:
548426 for choice in result ["choices" ]:
549427 doc = next ((h ["doc" ] for h in hits if getattr (h ["doc" ], "name" , "" ) == choice ["name" ]), None )
550428 if doc :
551- link = self . _best_runnable_link (doc )
429+ link = _best_runnable_link (doc )
552430 if link :
553431 choice ["demo_link" ] = link
554432
0 commit comments