@@ -146,6 +146,7 @@ def launch_disaggregated_llm(
146146 max_workers : int = 16 ,
147147 enable_perf = False ,
148148 extra_env : Optional [Dict [str , str ]] = None ,
149+ gen_extra_env : Optional [Dict [str , str ]] = None ,
149150):
150151 temp_dir = tempfile .TemporaryDirectory ()
151152 disaggregated_serving_config_path = os .path .join (
@@ -299,6 +300,8 @@ def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
299300
300301 for i , port in enumerate (gen_ports ):
301302 env = base_env .copy ()
303+ if gen_extra_env :
304+ env .update (gen_extra_env )
302305 env ["TRTLLM_USE_UCX_KVCACHE" ] = "1"
303306 # Need to set UCX_TLS to ^ib to avoid hangs on CI B200 cluster.
304307 env ["UCX_TLS" ] = "^ib"
@@ -633,6 +636,50 @@ def test_auto_dtype(self, ctx_disable_overlap_scheduler,
633636 self .MODEL_PATH ) as llm :
634637 run_accuracy_test (llm , self .MODEL_NAME , ["MMLU" , "GSM8K" ])
635638
@skip_pre_hopper
@pytest.mark.skip_less_device(2)
def test_kv_cache_v2_nixl_python(self):
    """Run GSM8K through disaggregated serving with KV cache manager v2.

    Both workers set ``enable_block_reuse=False`` and
    ``use_kv_cache_manager_v2=True`` and use the NIXL cache transceiver
    backend with ``transceiver_runtime="PYTHON"``. The context worker sets
    ``disable_overlap_scheduler=True``; the generation worker sets it to
    ``False``.
    """

    def make_worker_config(disable_overlap):
        # Build a fresh dict on every call so the context and generation
        # workers never share nested config objects.
        return {
            "disable_overlap_scheduler": disable_overlap,
            "kv_cache_config": {
                "enable_block_reuse": False,
                "use_kv_cache_manager_v2": True,
            },
            "cache_transceiver_config": {
                "backend": "NIXL",
                "transceiver_runtime": "PYTHON",
            },
        }

    ctx_cfg = make_worker_config(True)
    gen_cfg = make_worker_config(False)

    # One context instance and one generation instance behind a single
    # disaggregated front end.
    serving_cfg = {
        "hostname": "localhost",
        "port": 8000,
        "backend": "pytorch",
        "context_servers": {
            "num_instances": 1,
            "urls": ["localhost:8001"],
        },
        "generation_servers": {
            "num_instances": 1,
            "urls": ["localhost:8002"],
        },
    }

    with launch_disaggregated_llm(serving_cfg, ctx_cfg, gen_cfg,
                                  self.MODEL_PATH) as llm:
        run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])
682+
636683 @pytest .mark .skip_less_device (2 )
637684 def test_ngram (self ):
638685 speculative_decoding_config = {
@@ -952,6 +999,52 @@ def test_nixl_backend(self):
952999 self .MODEL_PATH ) as llm :
9531000 run_accuracy_test (llm , self .MODEL_NAME , ["MMLU" , "GSM8K" ])
9541001
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(60000)
@skip_no_hopper
def test_gen_only_sync(self):
    """Exercise the synchronous KV transfer path with the NIXL Python transceiver.

    The servers are launched with TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1,
    which is intended to make the generation worker receive KV cache through
    request_and_receive_sync rather than the async path. GSM8K accuracy is
    expected to match the standard async path.
    """

    def make_worker_config():
        # Context and generation workers use the same settings; return a
        # fresh dict per call so they do not share nested objects.
        return {
            "disable_overlap_scheduler": True,
            "cache_transceiver_config": {
                "backend": "NIXL",
                "transceiver_runtime": "PYTHON",
                "max_tokens_in_buffer": 4096,
            },
        }

    ctx_cfg = make_worker_config()
    gen_cfg = make_worker_config()

    # No explicit port or urls are given in this config.
    serving_cfg = {
        "hostname": "localhost",
        "backend": "pytorch",
        "context_servers": {
            "num_instances": 1,
        },
        "generation_servers": {
            "num_instances": 1,
        },
    }

    # Passed through extra_env on purpose so that it applies to both
    # servers; the gen worker is the one that uses the sync receive path.
    sync_transfer_env = {"TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP": "1"}

    with launch_disaggregated_llm(
            serving_cfg,
            ctx_cfg,
            gen_cfg,
            self.MODEL_PATH,
            extra_env=sync_transfer_env,
    ) as llm:
        run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])
1047+
9551048 @pytest .mark .skip_less_device (8 )
9561049 @parametrize_with_ids ("overlap_scheduler" , [True , False ])
9571050 @parametrize_with_ids ("mtp_nextn" , [0 , 2 ])
@@ -1141,6 +1234,51 @@ def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker):
11411234 self .MODEL_PATH ) as llm :
11421235 run_accuracy_test (llm , self .MODEL_NAME , ["JsonModeEval" ])
11431236
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(60000)
@skip_pre_hopper
def test_kv_cache_v2_nixl_python(self):
    """Run GSM8K through disaggregated serving with KV cache manager v2.

    The context and generation workers use identical settings: overlap
    scheduler disabled, ``enable_block_reuse=False``,
    ``use_kv_cache_manager_v2=True``, and the NIXL cache transceiver backend
    with ``transceiver_runtime="PYTHON"``.
    """

    def make_worker_config():
        # Return a fresh dict per call so the two workers never share
        # nested config objects.
        return {
            "disable_overlap_scheduler": True,
            "kv_cache_config": {
                "enable_block_reuse": False,
                "use_kv_cache_manager_v2": True,
            },
            "cache_transceiver_config": {
                "backend": "NIXL",
                "transceiver_runtime": "PYTHON",
            },
        }

    ctx_cfg = make_worker_config()
    gen_cfg = make_worker_config()

    # One context instance and one generation instance behind a single
    # disaggregated front end.
    serving_cfg = {
        "hostname": "localhost",
        "port": 8000,
        "backend": "pytorch",
        "context_servers": {
            "num_instances": 1,
            "urls": ["localhost:8001"],
        },
        "generation_servers": {
            "num_instances": 1,
            "urls": ["localhost:8002"],
        },
    }

    with launch_disaggregated_llm(serving_cfg, ctx_cfg, gen_cfg,
                                  self.MODEL_PATH) as llm:
        run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])
1281+
11441282
11451283@pytest .mark .timeout (DEFAULT_TEST_TIMEOUT )
11461284class TestGemma3_1BInstruct (LlmapiAccuracyTestHarness ):
@@ -1193,6 +1331,52 @@ def test_auto_dtype(self, block_reuse):
11931331 self .MODEL_PATH ) as llm :
11941332 run_accuracy_test (llm , self .MODEL_NAME , ["MMLU" , "GSM8K" ])
11951333
@pytest.mark.skip_less_device(2)
@skip_pre_hopper
def test_kv_cache_v2_nixl_python(self):
    """Run MMLU and GSM8K through disaggregated serving with KV cache manager v2.

    The context and generation workers use identical settings: overlap
    scheduler disabled, ``cuda_graph_config`` set to ``None``,
    ``enable_block_reuse=False``, ``use_kv_cache_manager_v2=True``, and the
    NIXL cache transceiver backend with ``transceiver_runtime="PYTHON"``.
    """

    def make_worker_config():
        # Return a fresh dict per call so the two workers never share
        # nested config objects.
        return {
            "disable_overlap_scheduler": True,
            "cuda_graph_config": None,
            "kv_cache_config": {
                "enable_block_reuse": False,
                "use_kv_cache_manager_v2": True,
            },
            "cache_transceiver_config": {
                "backend": "NIXL",
                "transceiver_runtime": "PYTHON",
            },
        }

    ctx_cfg = make_worker_config()
    gen_cfg = make_worker_config()

    # One context instance and one generation instance behind a single
    # disaggregated front end.
    serving_cfg = {
        "hostname": "localhost",
        "port": 8000,
        "backend": "pytorch",
        "context_servers": {
            "num_instances": 1,
            "urls": ["localhost:8001"],
        },
        "generation_servers": {
            "num_instances": 1,
            "urls": ["localhost:8002"],
        },
    }

    with launch_disaggregated_llm(serving_cfg, ctx_cfg, gen_cfg,
                                  self.MODEL_PATH) as llm:
        run_accuracy_test(llm, self.MODEL_NAME, ["MMLU", "GSM8K"])
1379+
11961380
11971381@skip_pre_blackwell
11981382@pytest .mark .skip_less_device_memory (80000 )
0 commit comments