 # syntax=docker/dockerfile:1.10.0
 # builder
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.01-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.03-py3

 # # build args
 FROM ${BASE_IMAGE} AS setup_env

-ARG CODESPACE=/root/codespace
-
-ARG FLASH_ATTN_DIR=/tmp/flash-attn
-ARG FLASH_ATTN3_DIR=/tmp/flash-attn3
-ARG ADAPTIVE_GEMM_DIR=/tmp/adaptive_gemm
-ARG GROUPED_GEMM_DIR=/tmp/grouped_gemm
-
 ARG TORCH_VERSION
-
 ARG PPA_SOURCE

-RUN if [ -d /etc/pip ] && [ -f /etc/pip/constraint.txt ]; then echo > /etc/pip/constraint.txt; fi
-RUN if [ -n "${TORCH_VERSION}" ]; then \
-    pip install torchvision torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu126 --no-cache-dir; \
-    fi
-
-# set reasonable default for CUDA architectures when building ngc image
-ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0 10.0"
-
-RUN sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
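+# BuildKit secret mount: the proxy is visible only during this RUN step and is not
+# written into any image layer. A hypothetical build invocation might look like:
+#   docker build --secret id=HTTPS_PROXY,env=HTTPS_PROXY --build-arg PPA_SOURCE=... .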
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
     apt update && \
     apt install --no-install-recommends ca-certificates -y && \
     apt install --no-install-recommends bc wget -y && \
     apt install --no-install-recommends build-essential sudo -y && \
     apt install --no-install-recommends git curl pkg-config tree unzip tmux \
-    openssh-server openssh-client nmap dnsutils iproute2 lsof net-tools -y && \
+    openssh-server openssh-client dnsutils iproute2 lsof net-tools zsh rclone \
+    iputils-ping telnet netcat-openbsd -y && \
     apt clean && rm -rf /var/lib/apt/lists/*

-RUN pip uninstall flash_attn -y
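+# NGC images constrain pip via /etc/pip/constraint.txt; truncate it so the installs below are unconstrained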
+RUN if [ -d /etc/pip ] && [ -f /etc/pip/constraint.txt ]; then echo > /etc/pip/constraint.txt; fi
+RUN pip install pystack py-spy --no-cache-dir
+RUN git config --system --add safe.directory "*"
+
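+# prefer cu128 wheels; cu126 stays as an extra index for packages not yet published for cu128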
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    if [ -n "${TORCH_VERSION}" ]; then \
+    pip install torchvision torch==${TORCH_VERSION} \
+        --index-url https://download.pytorch.org/whl/cu128 \
+        --extra-index-url https://download.pytorch.org/whl/cu126 \
+        --no-cache-dir; \
+    fi
+
+# set reasonable default for CUDA architectures when building ngc image
+ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0 10.0"
+
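+# drop the NGC-bundled flash_attn (rebuilt below); opencv's cv2 directory is removed
+# manually since pip can leave it behind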
+RUN pip uninstall flash_attn opencv -y && rm -rf /usr/local/lib/python3.12/dist-packages/cv2
+
+ARG CODESPACE=/root/codespace
+ARG FLASH_ATTN_DIR=/tmp/flash-attn
+ARG FLASH_ATTN3_DIR=/tmp/flash-attn3
+ARG ADAPTIVE_GEMM_DIR=/tmp/adaptive_gemm
+ARG GROUPED_GEMM_DIR=/tmp/grouped_gemm
+ARG DEEP_EP_DIR=/tmp/deep_ep
+ARG NVSHMEM_PREFIX=/usr/local/nvshmem
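+# NOTE: ARGs are stage-scoped; stages built FROM setup_env redeclare the ones they use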
+
+RUN mkdir -p $CODESPACE
+WORKDIR ${CODESPACE}

 # compile flash-attn
 FROM setup_env AS flash_attn
@@ -43,16 +55,14 @@ ARG FLASH_ATTN_DIR
 ARG FLASH_ATTN3_DIR
 ARG FLASH_ATTN_URL

-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
-
-RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 1) && \
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    git clone $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 1) && \
     cd ${CODESPACE}/flash-attention && \
-    git checkout $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 2)
+    git checkout $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force

 WORKDIR ${CODESPACE}/flash-attention

-RUN git submodule update --init --recursive --force
 RUN cd hopper && FLASH_ATTENTION_FORCE_BUILD=TRUE pip wheel -w ${FLASH_ATTN3_DIR} -v --no-deps .
 RUN FLASH_ATTENTION_FORCE_BUILD=TRUE pip wheel -w ${FLASH_ATTN_DIR} -v --no-deps .

@@ -63,16 +73,14 @@ ARG CODESPACE
 ARG ADAPTIVE_GEMM_DIR
 ARG ADAPTIVE_GEMM_URL

-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
-
-RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 1) && \
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    git clone $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 1) && \
     cd ${CODESPACE}/AdaptiveGEMM && \
-    git checkout $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 2)
+    git checkout $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force

 WORKDIR ${CODESPACE}/AdaptiveGEMM

-RUN git submodule update --init --recursive --force
 RUN pip wheel -w ${ADAPTIVE_GEMM_DIR} -v --no-deps .

 # compile grouped_gemm (permute and unpermute)
@@ -82,18 +90,52 @@ ARG CODESPACE
 ARG GROUPED_GEMM_DIR
 ARG GROUPED_GEMM_URL

-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
-
-RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 1) && \
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    git clone $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 1) && \
     cd ${CODESPACE}/GroupedGEMM && \
-    git checkout $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 2)
+    git checkout $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force

 WORKDIR ${CODESPACE}/GroupedGEMM

-RUN git submodule update --init --recursive --force
 RUN pip wheel -w ${GROUPED_GEMM_DIR} -v --no-deps .

+# build nvshmem from source and compile DeepEP
+FROM setup_env AS deep_ep
+
+ARG CODESPACE
+ARG DEEP_EP_DIR
+ARG DEEP_EP_URL
+ARG NVSHMEM_PREFIX
+# build sm90 and sm100 for deep_ep for now; ENV (not ARG) so it overrides the
+# arch list inherited from setup_env's ENV of the same name
+ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
+
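+# the NVSHMEM_* switches below trim the build to what DeepEP needs: IBGDA
+# (GPU-initiated RDMA) on, the other transports and launchers off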
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    curl -LO https://github.com/NVIDIA/nvshmem/releases/download/v3.4.5-0/nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
+    tar -zxvf nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
+    cd ${CODESPACE}/nvshmem_src && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
+    NVSHMEM_UCX_SUPPORT=0 \
+    NVSHMEM_USE_NCCL=0 \
+    NVSHMEM_MPI_SUPPORT=0 \
+    NVSHMEM_IBGDA_SUPPORT=1 \
+    NVSHMEM_USE_GDRCOPY=0 \
+    NVSHMEM_PMIX_SUPPORT=0 \
+    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    NVSHMEM_BUILD_TESTS=0 \
+    NVSHMEM_BUILD_EXAMPLES=0 \
+    NVSHMEM_BUILD_HYDRA_LAUNCHER=0 \
+    NVSHMEM_BUILD_TXZ_PACKAGE=0 \
+    NVSHMEM_BUILD_PYTHON_LIB=OFF \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_PREFIX} -DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 && \
+    cmake --build build --target install --parallel 32 && \
+    cd ${CODESPACE} && git clone $(echo ${DEEP_EP_URL} | cut -d '@' -f 1) && \
+    cd ${CODESPACE}/DeepEP && \
+    git checkout $(echo ${DEEP_EP_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force
+
+WORKDIR ${CODESPACE}/DeepEP
+
+RUN NVSHMEM_DIR=${NVSHMEM_PREFIX} pip wheel -w ${DEEP_EP_DIR} -v --no-deps .

 # integrate xtuner
 FROM setup_env AS xtuner_dev
@@ -105,53 +147,65 @@ ARG FLASH_ATTN_DIR
 ARG FLASH_ATTN3_DIR
 ARG ADAPTIVE_GEMM_DIR
 ARG GROUPED_GEMM_DIR
+ARG DEEP_EP_DIR
+ARG NVSHMEM_PREFIX

 COPY --from=flash_attn ${FLASH_ATTN3_DIR} ${FLASH_ATTN3_DIR}
 COPY --from=flash_attn ${FLASH_ATTN_DIR} ${FLASH_ATTN_DIR}
 COPY --from=adaptive_gemm ${ADAPTIVE_GEMM_DIR} ${ADAPTIVE_GEMM_DIR}
 COPY --from=grouped_gemm ${GROUPED_GEMM_DIR} ${GROUPED_GEMM_DIR}
+COPY --from=deep_ep ${DEEP_EP_DIR} ${DEEP_EP_DIR}
+COPY --from=deep_ep ${NVSHMEM_PREFIX} ${NVSHMEM_PREFIX}

 RUN unzip ${FLASH_ATTN_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
 RUN unzip ${FLASH_ATTN3_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
 RUN unzip ${ADAPTIVE_GEMM_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
 RUN unzip ${GROUPED_GEMM_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
+RUN unzip ${DEEP_EP_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}

-ARG XTUNER_URL
-ARG XTUNER_COMMIT
+# install sglang and its runtime requirements
+ARG SGLANG_VERSION
+
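+# --no-deps keeps sglang from pulling in its own torch/CUDA stack, so the runtime
+# dependencies it needs are listed explicitly here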
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    pip install sglang==${SGLANG_VERSION} sgl_kernel pybase64 orjson uvloop setproctitle msgspec \
+    compressed_tensors python-multipart torch_memory_saver \
+    grpcio-tools==1.75.1 hf_transfer interegular llguidance==0.7.11 \
+    xgrammar==0.1.24 blobfile==3.0.0 flashinfer_python==0.4.0 --no-cache-dir --no-deps
+
+# install lmdeploy and its missing runtime requirements
 ARG LMDEPLOY_VERSION
 ARG LMDEPLOY_URL

-# # install xtuner
-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
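+# a pinned LMDEPLOY_VERSION installs the PyPI release; otherwise build from the git revision in LMDEPLOY_URL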
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    pip install fastapi fire openai outlines \
+    partial_json_parser ray[default] shortuuid uvicorn \
+    'pydantic>2' openai_harmony --no-cache-dir && \
+    if [ -n "${LMDEPLOY_VERSION}" ]; then \
+    pip install lmdeploy==${LMDEPLOY_VERSION} --no-deps --no-cache-dir; \
+    else \
+    git clone $(echo ${LMDEPLOY_URL} | cut -d '@' -f 1) && \
+    cd ${CODESPACE}/lmdeploy && \
+    git checkout $(echo ${LMDEPLOY_URL} | cut -d '@' -f 2) && \
+    pip install . -v --no-deps --no-cache-dir; \
+    fi

-# RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${XTUNER_URL} | cut -d '@' -f 1) && \
-#     cd ${CODESPACE}/xtuner && \
-#     git checkout $(echo ${XTUNER_URL} | cut -d '@' -f 2)
+# # install xtuner
+ARG XTUNER_URL
+ARG XTUNER_COMMIT
+# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+#     git clone $(echo ${XTUNER_URL} | cut -d '@' -f 1) && \
+#     cd ${CODESPACE}/xtuner && \
+#     git checkout $(echo ${XTUNER_URL} | cut -d '@' -f 2)
 COPY . ${CODESPACE}/xtuner

 WORKDIR ${CODESPACE}/xtuner
-RUN export HTTPS_PROXY=$HTTPS_PROXY \
-    && export https_proxy=$HTTPS_PROXY \
-    && pip install liger-kernel parametrize --no-cache-dir \
-    && pip install . -v --no-cache-dir
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    pip install .[all] -v --no-cache-dir

-RUN pip install pystack py-spy --no-cache-dir
-RUN git config --system --add safe.directory "*"
-
-# install lmdeploy and its missing runtime requirements
-RUN pip install fastapi fire openai outlines \
-    partial_json_parser ray[default] shortuuid uvicorn \
-    'numpy<2.0.0' \
-    python-sat[aiger,approxmc,cryptosat,pblib] distance Faker --no-cache-dir
 WORKDIR ${CODESPACE}
-RUN if [ -n "${LMDEPLOY_VERSION}" ]; then \
-    pip install lmdeploy==${LMDEPLOY_VERSION} --no-deps --no-cache-dir; \
-    else \
-    git clone -c https.proxy=$HTTPS_PROXY $(echo ${LMDEPLOY_URL} | cut -d '@' -f 1) && \
-    cd ${CODESPACE}/lmdeploy && \
-    git checkout $(echo ${LMDEPLOY_URL} | cut -d '@' -f 2) && \
-    pip install . -v --no-deps --no-cache-dir; \
+
+# nccl update for torch 2.6.0
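+# (torch 2.6.0 wheels pin nvidia-nccl-cu12 2.21.5; presumably bumped here for newer NCCL fixes)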
+RUN if [ "x${TORCH_VERSION}" = "x2.6.0" ]; then \
+    pip install nvidia-nccl-cu12==2.25.1 --no-cache-dir; \
     fi

 # setup sysctl