@@ -14,8 +14,10 @@
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm._torch.attention_backend.trtllm import TrtllmAttentionMetadata
 from tensorrt_llm._torch.metadata import KVCacheParams
+from tensorrt_llm.executor.request import LoRARequest
 from tensorrt_llm.llmapi import (CudaGraphConfig, Eagle3DecodingConfig,
                                  KvCacheConfig)
+from tensorrt_llm.lora_helper import LoraConfig
 
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 
@@ -756,8 +758,9 @@ def test_eagle3_cuda_graph_padding(disable_overlap_scheduler: bool):
     llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
 
     prompts = [
-        "The capital of France is", "The president of the United States is",
-        "The future of AI is"
+        "The capital of France is",
+        "The president of the United States is",
+        "The future of AI is",
     ]
 
     sampling_params = SamplingParams(max_tokens=2048, temperature=0)
@@ -815,5 +818,68 @@ def test_eagle3_cdl_sampling(disable_overlap_scheduler: bool):
     llm_spec.shutdown()
 
 
+@pytest.mark.parametrize("use_cuda_graph", [True, False])
+def test_eagle3_lora(use_cuda_graph: bool):
+    """Test LoRA with 3 requests and max_batch_size=4.
+
+    This test verifies that when using LoRA modules,
+    the system properly applies the LoRA configurations.
+    """
+    attn_backend = "TRTLLM"
+    enable_block_reuse = False
+    use_one_model = True
+    enable_chunked_prefill = False
+
+    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+    if total_mem_gb < 35:
+        pytest.skip("Not enough memory to load target + draft model")
+
+    models_path = llm_models_root()
+
+    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
+    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+    hf_lora_dir = f"{models_path}/llama-models/luotuo-lora-7b-0.1"
+
+    # Test with 3 requests and max_batch_size=4 to trigger padding
+    max_batch_size = 4
+    max_draft_len = 4
+    kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
+                                    max_tokens=8192)
+    cuda_graph_config = CudaGraphConfig(
+        batch_sizes=[1, 2, 4], enable_padding=True) if use_cuda_graph else None
+    lora_config = LoraConfig(max_lora_rank=64, max_loras=2, max_cpu_loras=2)
+
+    llm_common_config = dict(
+        model=target_model_dir,
+        attn_backend=attn_backend,
+        cuda_graph_config=cuda_graph_config,
+        max_batch_size=max_batch_size,
+        kv_cache_config=kv_cache_config,
+        max_seq_len=1024,
+        enable_chunked_prefill=enable_chunked_prefill,
+        lora_config=lora_config,
+    )
+
+    spec_config = Eagle3DecodingConfig(
+        max_draft_len=max_draft_len,
+        speculative_model=eagle_model_dir,
+        eagle3_one_model=use_one_model,
+    )
+
+    # Create the LLM instance
+    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
+
+    prompts = [
+        "The capital of France is",
+        "The president of the United States is",
+        "The future of AI is",
+    ]
+    lora_requests = [LoRARequest("luotuo", 1, hf_lora_dir)] * len(prompts)
+
+    sampling_params = SamplingParams(max_tokens=20, temperature=0)
+    llm_spec.generate(prompts, sampling_params, lora_request=lora_requests)
+    llm_spec.shutdown()
+
+
 if __name__ == "__main__":
     unittest.main()