Fix query expansion with new retrieval

vkehfdl1 · vkehfdl1 · commit 81d84f4f7913 · 2025-07-01T15:50:48.000+09:00
diff --git a/autorag/autorag/nodes/queryexpansion/run.py b/autorag/autorag/nodes/queryexpansion/run.py
@@ -2,14 +2,15 @@
 import os
 import pathlib
 from copy import deepcopy
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Union
 
 import pandas as pd
 
-from autorag.nodes.retrieval.run import evaluate_retrieval_node
+from autorag.evaluation import evaluate_retrieval
 from autorag.schema.metricinput import MetricInput
 from autorag.strategy import measure_speed, filter_by_threshold, select_best
 from autorag.support import get_support_modules
+from autorag.utils.cast import cast_retrieve_infos
 from autorag.utils.util import make_combinations, explode
 
 logger = logging.getLogger("AutoRAG")
@@ -217,14 +218,16 @@ def evaluate_one_query_expansion_node(
 			zip(retrieval_funcs, retrieval_params),
 		)
 	)
+	# Cast each retrieval result
+	retrieval_result_dicts = list(map(cast_retrieve_infos, retrieval_results))
 	evaluation_results = list(
 		map(
 			lambda x: evaluate_retrieval_node(
 				x,
 				metric_inputs,
 				metrics,
 			),
-			retrieval_results,
+			retrieval_result_dicts,
 		)
 	)
 	best_result, _ = select_best(
@@ -274,3 +277,32 @@ def make_retrieval_callable_params(strategy_dict: Dict):
 		)
 	)
 	return explode(modules, param_combinations)
+
+
+def evaluate_retrieval_node(
+	result_dict: Dict,
+	metric_inputs: List[MetricInput],
+	metrics: Union[List[str], List[Dict]],
+) -> pd.DataFrame:
+	"""
+	Evaluate a retrieval node result dict with the given metrics.
+
+	:param result_dict: The result dict from a retrieval node. Its 'retrieved_contents', 'retrieved_ids', and 'retrieve_scores' entries are evaluated.
+	:param metric_inputs: List of metric input schema for AutoRAG.
+	:param metrics: Metric list from input strategies.
+	:return: The evaluation result dataframe with metrics columns.
+	    The columns will be 'retrieved_contents', 'retrieved_ids', 'retrieve_scores', and metric names.
+	"""
+
+	@evaluate_retrieval(
+		metric_inputs=metric_inputs,
+		metrics=metrics,
+	)
+	def evaluate_this_module(_dict: Dict):
+		return (
+			_dict["retrieved_contents"],
+			_dict["retrieved_ids"],
+			_dict["retrieve_scores"],
+		)
+
+	return evaluate_this_module(result_dict)
diff --git a/docs/source/migration.md b/docs/source/migration.md
@@ -151,3 +151,7 @@ node_lines:
 ```
 
 This YAML file do the same thing as the previous v0.3.7 version.
+
+Also, you're no longer able to use the hybrid retrieval node in the `query_expansion` node as `retrieval_modules`.
+We're considering adding this feature in the future, but for now, you can use the semantic and lexical retrieval nodes to evaluate query expansion.
+In most cases, you don't need to use the hybrid retrieval node in the `query_expansion` node.
diff --git a/docs/source/nodes/query_expansion/query_expansion.md b/docs/source/nodes/query_expansion/query_expansion.md
@@ -35,11 +35,12 @@ Please refer to the parameter of [retrieval Node](../retrieval/retrieval.md) for
 1. **Metrics**: Metrics such as `retrieval_f1`,`retrieval_recall`, and `retrieval_precision` are used to evaluate the performance of the query expansion process through its impact on retrieval outcomes.
 2. **Speed Threshold**: `speed_threshold` is applied across all nodes, ensuring that any method exceeding the average processing time for a query is not used.
 3. **Top_k**: This parameter specifies the number of top results to consider during the retrieval evaluation phase.
-4. **Retrieval Modules**: The query expansion node can use all modules and module parameters from the retrieval node, including:
+4. **Retrieval Modules**: The query expansion node can use modules and module parameters from the lexical retrieval and semantic retrieval nodes, including:
     - [bm25](../retrieval/bm25.md)
-    - [vectordb](../retrieval/vectordb.md): with `embedding_model` parameter
-    - [hybrid_rrf](../retrieval/hybrid_rrf.md): with `target_modules` and `rrf_k` parameters
-    - [hybrid_cc](../retrieval/hybrid_cc.md): with `target_modules` and `weights` parameters
+    - [vectordb](../retrieval/vectordb.md): with `vectordb` parameter
+```{warning}
+You cannot use the hybrid retrieval modules in the query expansion node.
+```
 
 ### Example config.yaml file
 ```yaml
diff --git a/tests/autorag/nodes/queryexpansion/test_query_expansion_run.py b/tests/autorag/nodes/queryexpansion/test_query_expansion_run.py
@@ -14,7 +14,9 @@
 from autorag.nodes.queryexpansion import QueryDecompose, HyDE
 from autorag.nodes.queryexpansion.run import evaluate_one_query_expansion_node
 from autorag.nodes.queryexpansion.run import run_query_expansion_node
-from autorag.nodes.retrieval import BM25, VectorDB, HybridCC
+from autorag.nodes.lexicalretrieval import BM25
+from autorag.nodes.semanticretrieval import VectorDB
+from autorag.nodes.hybridretrieval import HybridCC
 from autorag.nodes.semanticretrieval.vectordb import vectordb_ingest_api
 from autorag.schema.metricinput import MetricInput
 from autorag.utils.util import load_summary_file, get_event_loop
@@ -112,18 +114,6 @@ def test_evaluate_one_query_expansion_node_vectordb(node_line_dir):
     retrieval_params = [
         {"top_k": 3, "vectordb": "chroma_large"},
         {"top_k": 5, "vectordb": "chroma_small"},
-        {
-            "top_k": 5,
-            "target_modules": ("bm25", "vectordb"),
-            "target_module_params": (
-                {"top_k": 3, "bm25_tokenizer": "gpt2"},
-                {
-                    "top_k": 3,
-                    "vectordb": "chroma_large",
-                },
-            ),
-            "weight": 0.36,
-        },
     ]
     base_test_evaluate_one_query_expansion_node(
         node_line_dir, retrieval_funcs, retrieval_params