Skip to content

Commit f04e3a9

Browse files
fix: eliminate redundant RAG model reloading that defeated parallelization
Signed-off-by: Aryan Patel <aryan.patel7291@gmail.com>
1 parent 0a1acb9 commit f04e3a9

File tree

1 file changed

+40
-31
lines changed
  • examples/government_rag/singletask_learning_bench/testalgorithms/basemodel.py

1 file changed

+40
-31
lines changed

examples/government_rag/singletask_learning_bench/testalgorithms/basemodel.py

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ def __init__(self, **kwargs):
5252
self.gpu_lock = threading.Lock()
5353
self.rag = None
5454
self.get_model_response = self.get_model_response_qianfan
55-
pass
5655

5756
def get_model_response_deepseek(self, prompt):
5857
# Please install OpenAI SDK first: `pip3 install openai`
@@ -156,9 +155,14 @@ def get_access_token():
156155

157156
def preprocess(self, **kwargs):
    """One-time setup: build the RAG instance shared by every query.

    Loading the embedding model and the ChromaDB vector store is the
    expensive part, so it happens exactly once here rather than inside
    process_query().  When it ran per query, every worker thread ended
    up serialized on the gpu_lock waiting for the reload, which made
    the ThreadPoolExecutor in predict() effectively sequential.
    """
    print("BaseModel preprocess")
    self.rag = GovernmentRAG(
        model_name="/home/icyfeather/models/bge-m3",
        device="cuda",
        persist_directory="./chroma_db",
    )
    LOGGER.info("RAG initialized once for all queries")
162166

163167
def train(self, train_data, valid_data=None, **kwargs):
164168
print("BaseModel doesn't need to train")
@@ -168,42 +172,49 @@ def save(self, model_path):
168172
print("BaseModel doesn't need to save")
169173

170174
def process_query(self, query: str, ground_truth: str, location: str, rag_type: str) -> str:
    """Process a single query with the specified RAG type.

    rag_type selects the retrieval corpus:
      - "[model]":  no retrieval; the LLM is queried directly.
      - "[global]": the shared, unfiltered RAG built in preprocess().
      - "[local]":  documents from `location` only.
      - "[other]":  documents from every province except `location`.

    RAG instances are created at most once per configuration and cached
    (see _get_rag), so worker threads never pay the model-loading cost
    per query.  The gpu_lock protects only the brief vector similarity
    search and any one-off lazy load; the slow LLM API calls happen
    outside the lock and run in parallel.

    Returns:
        "response||ground_truth||location||rag_type".

    Raises:
        Re-raises any exception after logging it.
    """
    try:
        if rag_type == "[model]":
            # No retrieval needed: ask the LLM directly.
            response = self.get_model_response(query)
        else:
            rag = self._get_rag(rag_type, location)
            # Quick vector similarity search -- not a model reload.
            with self.gpu_lock:
                relevant_docs = rag.query(query, k=1)

            # Build the augmented prompt with the retrieved context.
            augmented_prompt = (
                "在你回答问题之前,你被提供了以下可能相关的信息:"
                + relevant_docs
                + "\n现在请你回答问题:"
                + query
            )
            response = self.get_model_response(augmented_prompt)

        return response + "||" + ground_truth + "||" + location + "||" + rag_type
    except Exception as e:
        LOGGER.error(f"Error in process_query: {str(e)}")
        raise e

def _get_rag(self, rag_type: str, location: str):
    """Return the (cached) GovernmentRAG matching rag_type/location.

    Fixes a regression: when the per-query reload was removed, the
    province filtering went with it, so "[local]" and "[other]"
    silently fell back to the same unfiltered global index as
    "[global]" -- collapsing the three benchmark conditions into one.
    Each filtered instance is now built at most once, under the
    gpu_lock, and reused by all subsequent queries, preserving the
    no-reload parallelization win.
    """
    with self.gpu_lock:
        # Lazy cache; created on first use so __init__ needs no change.
        if not hasattr(self, "_rag_cache"):
            self._rag_cache = {}
        # "[global]" ignores location; filtered types key on it.
        key = (rag_type, None if rag_type == "[global]" else location)
        rag = self._rag_cache.get(key)
        if rag is None:
            if rag_type == "[local]":
                provinces = [location]
            elif rag_type == "[other]":
                provinces = list(set(self.all_locations) - {location})
            else:  # "[global]" (or unknown type -> unfiltered fallback)
                provinces = None
            if provinces is None:
                # Reuse the shared instance built in preprocess() if present.
                if self.rag is None:
                    self.rag = GovernmentRAG(
                        model_name="/home/icyfeather/models/bge-m3",
                        device="cuda",
                        persist_directory="./chroma_db",
                    )
                rag = self.rag
            else:
                rag = GovernmentRAG(
                    model_name="/home/icyfeather/models/bge-m3",
                    device="cuda",
                    persist_directory="./chroma_db",
                    provinces=provinces,
                )
            self._rag_cache[key] = rag
        return rag
201205

202206
def predict(self, data, input_shape=None, **kwargs):
203207
print("BaseModel predict")
204208
LOGGER.info("BaseModel predict")
205-
LOGGER.info(f"Dataset: {data.dataset_name}")
206-
LOGGER.info(f"Description: {data.description}")
209+
210+
# Make sure the RAG system is ready before processing queries
211+
if self.rag is None:
212+
LOGGER.info("RAG not initialized yet, loading now...")
213+
self.rag = GovernmentRAG(
214+
model_name="/home/icyfeather/models/bge-m3",
215+
device="cuda",
216+
persist_directory="./chroma_db"
217+
)
207218

208219
answer_list = []
209220

@@ -213,21 +224,19 @@ def predict(self, data, input_shape=None, **kwargs):
213224
# Create tasks for all queries
214225
tasks = []
215226
for i in range(len(data.x)):
216-
# Add global task
217227
tasks.append((data.x[i], data.y[i], current_dir, "[global]"))
218-
# Add local task
219228
tasks.append((data.x[i], data.y[i], current_dir, "[local]"))
220-
# Add other task
221229
tasks.append((data.x[i], data.y[i], current_dir, "[other]"))
222-
# Add model task
223230
tasks.append((data.x[i], data.y[i], current_dir, "[model]"))
224231

225-
# Process tasks in parallel using ThreadPoolExecutor
226-
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: # Reduced number of workers
232+
# Process tasks in parallel using ThreadPoolExecutor.
233+
# Now that GovernmentRAG is loaded once and shared, the threads
234+
# only block briefly on the gpu_lock for vector search. The slow
235+
# LLM API calls happen outside the lock and truly run in parallel.
236+
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
227237
futures = [executor.submit(self.process_query, query, gt, loc, rag_type)
228238
for query, gt, loc, rag_type in tasks]
229239

230-
# Use tqdm to show progress
231240
for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing queries"):
232241
try:
233242
result = future.result()

0 commit comments

Comments
 (0)