# Source: TensorRT-LLM/tests/integration/defs/verl/verl_config.yml at 51f7956172e1434f9470304764f3475a04d8221a (NVIDIA/TensorRT-LLM, GitHub)

# Verl configuration for CI stage

verl_config:
  # Upstream verl repository and the revision to check out.
  repo_url: 'https://github.com/volcengine/verl.git'
  # Looks like an abbreviated commit hash; kept quoted so it always loads as a string.
  repo_tag: '4cda6af'
  # Test directory name (presumably relative to the verl checkout; confirm against the consumer).
  test_dir: 'tests'

  # Shell commands that prepare the container for the verl tests.
  # Presumably executed in the listed order (the nvshmem symlink must exist
  # before the DeepEP build); confirm against the consumer.
  install_commands:
    # Build and install gdrcopy v2.5.1 under /usr/local, then remove the checkout
    - >-
      git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git &&
      (cd gdrcopy && make prefix=/usr/local lib_install) &&
      rm -rf gdrcopy
    # Install nvshmem (pinned wheel)
    - 'pip install nvidia-nvshmem-cu13==3.3.20'
    # Create the unversioned nvshmem symlink (needed before DeepEP build).
    # -f replaces an existing link/file so the step can be re-run without failing.
    - >-
      (cd /usr/local/lib/python3.12/dist-packages/nvidia/nvshmem/lib &&
      ln -sf libnvshmem_host.so.3 libnvshmem_host.so)
    # Install DeepEP v1.2.1: clone, apply the Megatron-LM core_v0.15.0
    # deepep.patch, build for the listed CUDA architectures, remove the checkout
    - >-
      git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git &&
      (cd DeepEP &&
      wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch &&
      patch -p1 < deepep.patch &&
      TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install) &&
      rm -rf DeepEP
    # Install Python dependencies (trl and Megatron-LM are installed with --no-deps)
    - 'pip3 install --no-cache-dir --no-deps trl'
    - 'pip3 install --no-cache-dir nvtx matplotlib liger_kernel cachetools'
    # NOTE(review): mbridge is installed without a pinned ref, unlike the other
    # git dependency below; consider pinning for reproducible CI runs.
    - 'pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git'
    - 'pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0'
    - 'pip3 install pytest-asyncio'
    # Double-quoted because the command itself contains single quotes
    - "pip3 install --no-cache-dir 'ray[default]==2.54.1'"


  # Environment variables to expose in the container before setup runs.
  # NVSHMEM_DIR comes first because the LD_LIBRARY_PATH and PATH entries
  # reference it (presumably expanded by a shell at export time; confirm).
  env_vars:
    - 'NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem'
    - 'LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"'
    - 'PATH="${NVSHMEM_DIR}/bin:$PATH"'
    - 'TRTLLM_TEST_MODEL_PATH_ROOT=/tmp/verl-models'

  # Root of the read-only CI model cache.
  # Flat layout: models sit directly under it as /scratch.../ModelName.
  ci_model_cache: '/scratch.trt_llm_data/llm-models'

  # Models the verl tests need, given as HF-style "org/name" identifiers.
  # Symlinks are created from these HF-style paths to the CI cache paths.
  required_models:
    - 'Qwen/Qwen2.5-0.5B-Instruct'
    - 'Qwen/Qwen2.5-1.5B-Instruct'
    - 'Qwen/Qwen2.5-VL-7B-Instruct'