Skip to content

[Store] Implement dual rdma forward path #7216

[Store] Implement dual rdma forward path

[Store] Implement dual rdma forward path #7216

Workflow file for this run

name: 'Build & Test (Linux)'

# Triggers: pushes to, and pull requests targeting, the two listed branches.
# The `labeled` PR activity type is included so that adding a label (the jobs
# below look for `run-ci`) starts a run.
on:
  push:
    branches:
      - "main"
      - "P2P-Mooncake-Store"
  pull_request:
    branches:
      - "main"
      - "P2P-Mooncake-Store"
    types:
      - opened
      - synchronize
      - reopened
      - labeled

jobs:
# Debug build with ASAN and --coverage flags. Runs the ctest suite against a
# local etcd and the HTTP metadata server, collects lcov coverage, uploads it
# to Codecov, and finally builds a Python wheel for each matrix Python version.
build:
  # Run on pushes, on freshly opened PRs, or when the PR carries `run-ci`.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      python-version: ['3.10', '3.12']
  env:
    SCCACHE_GHA_ENABLED: "true"
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install and start etcd
      run: |
        wget https://github.com/etcd-io/etcd/releases/download/v3.6.1/etcd-v3.6.1-linux-amd64.tar.gz
        tar xzf etcd-v3.6.1-linux-amd64.tar.gz
        sudo mv etcd-v3.6.1-linux-amd64/etcd* /usr/local/bin/
        etcd --advertise-client-urls http://127.0.0.1:2379 --listen-client-urls http://127.0.0.1:2379 &
        # Poll the health endpoint instead of relying on a fixed sleep, so a
        # slow runner does not make the step flaky and a dead etcd fails fast.
        etcd_ready=false
        for attempt in $(seq 1 30); do
          if etcdctl --endpoints=http://127.0.0.1:2379 endpoint health; then
            etcd_ready=true
            break
          fi
          echo "etcd not ready yet (attempt ${attempt}/30), retrying..."
          sleep 1
        done
        if [ "${etcd_ready}" != "true" ]; then
          echo "ERROR: etcd did not become healthy in time"
          exit 1
        fi
      shell: bash
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
    - name: Install CUDA Toolkit
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        linux-local-args: '["--toolkit"]'
        method: 'network'
        sub-packages: '["nvcc"]'
    - name: Install coverage tools
      run: |
        sudo apt-get update
        sudo apt-get install -y lcov gcovr
    - name: Set up coverage compilation flags
      # Exported through GITHUB_ENV so every later step sees the flags.
      run: |
        echo "Setting up coverage compilation flags..."
        echo "CXXFLAGS=--coverage" >> $GITHUB_ENV
        echo "CFLAGS=--coverage" >> $GITHUB_ENV
        echo "LDFLAGS=--coverage" >> $GITHUB_ENV
      shell: bash
    - name: Run sccache-cache
      uses: mozilla-actions/sccache-action@v0.0.9
    - name: Configure sccache
      # Re-export the runner's cache service credentials as job environment
      # variables (empty string when unset).
      uses: actions/github-script@v7
      with:
        script: |
          core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
    - name: Run sccache stat for check
      shell: bash
      run: ${SCCACHE_PATH} --show-stats
    - name: Configure project with coverage support
      run: |
        sudo apt update -y
        sudo apt install -y ninja-build
        sudo bash -x dependencies.sh -y
        mkdir build
        cd build
        cmake -G Ninja .. -DUSE_HTTP=ON -DUSE_CXL=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DENABLE_ASAN=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Debug
      shell: bash
    - name: Build project
      run: |
        cd build
        cmake --build .
        sudo cmake --install .
      shell: bash
    - name: Build nvlink_allocator.so
      run: |
        mkdir -p build/mooncake-transfer-engine/nvlink-allocator
        cd mooncake-transfer-engine/nvlink-allocator
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/
      shell: bash
    - name: Start Metadata Server
      run: |
        cd mooncake-transfer-engine/example/http-metadata-server-python
        pip install aiohttp
        python ./bootstrap_server.py &
      shell: bash
    - name: Test (in build env) with coverage
      run: |
        cd build
        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
        ldconfig -v || echo "always continue"
        # NOTE(review): `-j` is given without an explicit level; confirm the
        # runner's ctest version accepts that form.
        MC_METADATA_SERVER=http://127.0.0.1:8080/metadata DEFAULT_KV_LEASE_TTL=500 ctest -j --output-on-failure
      shell: bash
    - name: Generate coverage report
      id: coverage
      # Best-effort: a failed capture writes a placeholder tracefile and sets
      # the `coverage_failed` output instead of failing the job.
      run: |
        cd build
        echo "=== Starting coverage report generation ==="
        echo "Current directory: $(pwd)"
        echo "=== Looking for .gcda files ==="
        # Collect the list first: `find` exits 0 even when nothing matches, so
        # the empty case must be detected from its output, and piping straight
        # into `head` can trip `pipefail` when `head` closes the pipe early.
        gcda_files=$(find . -name "*.gcda" 2>/dev/null || true)
        if [ -z "${gcda_files}" ]; then
          echo "No .gcda files found"
        else
          printf '%s\n' "${gcda_files}" | sed -n '1,10p'
        fi
        echo "=== Running lcov ==="
        lcov --capture --directory . --output-file coverage.info 2>&1 || {
          echo "WARNING: lcov failed to capture coverage data"
          echo "Creating minimal lcov-compliant coverage file to allow CI to continue"
          echo "TN:dummy" > coverage.filtered.info
          echo "SF:/dev/null" >> coverage.filtered.info
          echo "DA:0,0" >> coverage.filtered.info
          echo "end_of_record" >> coverage.filtered.info
          echo "coverage_failed=true" >> $GITHUB_OUTPUT
          exit 0  # Exit successfully, do not block CI
        }
        echo "=== Processing coverage data ==="
        lcov --remove coverage.info '/usr/*' '*/test/*' '*/third_party/*' --output-file coverage.filtered.info 2>&1 || true
        echo "=== Generating HTML report ==="
        genhtml coverage.filtered.info --output-directory coverage_report 2>&1 || echo "genhtml failed, continuing..."
        echo "=== Coverage summary ==="
        lcov --list coverage.filtered.info 2>&1 || echo "lcov list failed"
        echo "=== Coverage report generation completed ==="
      shell: bash
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        files: build/coverage.filtered.info
        flags: unittests
        name: code-coverage-report
        token: ${{ secrets.CODECOV_TOKEN }}
        fail_ci_if_error: false
      continue-on-error: true
    - name: Check coverage status
      # Surfaces the best-effort coverage outcome as a workflow warning.
      if: always()
      run: |
        if [ "${{ steps.coverage.outputs.coverage_failed }}" = "true" ]; then
          echo "⚠️ Coverage collection failed but CI continued"
          echo "::warning::Code coverage collection failed. Please check the build logs."
        else
          echo "✅ Coverage collected successfully"
        fi
    - name: Generate Python version tag
      id: generate_tag_build
      # Strips the dot from the matrix version, e.g. 3.10 -> 310.
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    - name: Build Python wheel
      run: |
        # Build wheel with specific Python version
        PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_build.outputs.python_version_tag }} ./scripts/build_wheel.sh
      shell: bash
# Compile-only check of the MUSA configuration inside the mthreads/musa
# container; unit tests and examples are switched off in the cmake call.
build-musa:
  runs-on: ubuntu-22.04
  container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
  # Same gate as the other jobs: push, newly opened PR, or `run-ci` label.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  steps:
    - uses: actions/checkout@v4

    # Register the workspace as a git safe.directory for the container user.
    - name: Mark repository as safe
      shell: bash
      run: git config --global --add safe.directory $GITHUB_WORKSPACE

    # No sudo here, unlike the host-runner jobs.
    - name: Configure project
      shell: bash
      run: |
        apt update -y
        apt install -y ninja-build
        bash -x dependencies.sh -y
        mkdir build
        cd build
        cmake -G Ninja .. -DUSE_MUSA=ON -DUSE_MNNVL=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DUSE_CXL=ON -DUSE_TCP=ON -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF

    - name: Build project
      shell: bash
      # NOTE(review): ~/.bashrc is sourced before building, presumably to pick
      # up environment written by dependencies.sh - confirm.
      run: |
        cd build
        source ~/.bashrc
        cmake --build .
        cmake --install .
# Installs the wheel produced by `build-flags` on each Ubuntu/Python
# combination and runs the installation, store, tensor, async and RPC tests.
test-wheel-ubuntu:
  needs: build-flags
  strategy:
    matrix:
      ubuntu-version: [ubuntu-22.04, ubuntu-24.04]
      python-version: ['3.10', '3.12']
  runs-on: ${{ matrix.ubuntu-version }}
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Generate Python version tag
      id: generate_tag_test
      # Strips the dot from the matrix version, e.g. 3.10 -> 310, to match
      # the artifact name uploaded by `build-flags`.
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    - name: Download wheel artifact
      uses: actions/download-artifact@v4
      with:
        name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_test.outputs.python_version_tag }}
        path: mooncake-wheel/dist
    - name: Verify wheel file exists
      run: |
        ls -la mooncake-wheel/dist/
        # `compgen -G` succeeds when the glob matches one or more files;
        # `[ -f <glob> ]` errors out as soon as the glob expands to more
        # than one path.
        if ! compgen -G "mooncake-wheel/dist/*.whl" > /dev/null; then
          echo "ERROR: No wheel file found in mooncake-wheel/dist/"
          exit 1
        fi
      shell: bash
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
        sudo rm -rf /usr/local/lib/android
        df -h
    - name: Install CUDA Toolkit
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        linux-local-args: '["--toolkit"]'
        method: 'network'
    - name: Run installation test script
      run: |
        bash scripts/test_installation.sh
      shell: bash
    - name: Start metadata server
      # The later steps activate the same `test_env` virtualenv.
      run: |
        source test_env/bin/activate
        mooncake_http_metadata_server --port 8080 &
      shell: bash
    - name: Run tests with ssd
      run: |
        source test_env/bin/activate
        MC_STORE_MEMCPY=false TEST_SSD_OFFLOAD_IN_EVICT=true ./scripts/run_tests.sh
        rm -rf /tmp/mooncake_test_ssd
        deactivate
      shell: bash
    - name: Start Mooncake Master
      run: |
        source test_env/bin/activate
        mkdir -p /tmp/mooncake_storage
        mooncake_master \
          --eviction_high_watermark_ratio=0.95 \
          --cluster_id=ci_test_cluster \
          --port 50051 &
        sleep 3
      shell: bash
    - name: Run Python Tensor API Performance Test (CI check)
      env:
        MOONCAKE_MASTER: "127.0.0.1:50051"
        MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
        MOONCAKE_PROTOCOL: "tcp"
        LOCAL_HOSTNAME: "127.0.0.1"
      run: |
        source test_env/bin/activate
        python scripts/test_tensor_api.py --mode perf --iterations 1
      shell: bash
    - name: Run Python Async API Test (CI check)
      env:
        MOONCAKE_MASTER: "127.0.0.1:50051"
        MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
        MOONCAKE_PROTOCOL: "tcp"
        LOCAL_HOSTNAME: "127.0.0.1"
      run: |
        source test_env/bin/activate
        python scripts/test_async_store.py
      shell: bash
    - name: Run RPC Communicator Bandwidth Test
      # The client result is deliberately ignored (`|| true`); the server is
      # always stopped afterwards.
      run: |
        source test_env/bin/activate
        python mooncake-transfer-engine/tests/rpc_communicator_test.py server --url 127.0.0.1:9004 --data-size 1 &
        SERVER_PID=$!
        sleep 5
        timeout 10 python mooncake-transfer-engine/tests/rpc_communicator_test.py client --url 127.0.0.1:9004 --threads 2 --data-size 1 || true
        kill $SERVER_PID 2>/dev/null || true
        wait $SERVER_PID 2>/dev/null || true
# Creates a job on the external T-one service (tone.openanolis.cn) for the
# py312 wheel artifact of this run and polls it until it passes, fails, or
# times out. Both steps are skipped when the TONE_USER_NAME secret is empty.
test-sglang-integration:
  needs: build-flags
  runs-on: ubuntu-latest
  env:
    tone_user_name: ${{ secrets.TONE_USER_NAME }}
  steps:
    - name: trigger T-one test
      if: ${{ env.tone_user_name != '' }}
      env:
        # Passed through the environment rather than interpolated into the
        # script text, so special characters in the secret cannot alter it.
        TONE_USER_TOKEN: ${{ secrets.TONE_USER_TOKEN }}
      run: |
        max_attempts=5
        attempt=1
        while [ $attempt -le $max_attempts ]; do
          echo "Attempt $attempt: Fetching artifact..."
          # --fail makes HTTP errors (e.g. 403 rate limiting) count as a
          # failure so the retry loop actually retries them.
          if curl --fail -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/artifacts > artifact.json; then
            echo "Successfully fetched artifact"
            break
          else
            echo "Failed to fetch artifact. Retrying..."
            sleep 60
          fi
          attempt=$((attempt + 1))
        done
        if [ $attempt -gt $max_attempts ]; then
          echo "Failed to fetch artifacts after $max_attempts attempts"
          exit 1
        fi
        cat artifact.json
        artifact_id=$(jq -r ".artifacts[] | select(.name | contains(\"py312\") ) | .id" artifact.json)
        if [ -z "${artifact_id}" ]; then
          echo "No py312 wheel artifact found for this run"
          exit 1
        fi
        # Signature is base64("<user>|<token>|<unix time>").
        signature="${tone_user_name}|${TONE_USER_TOKEN}|$(python3 -c "import time;print(time.time())")"
        signature="$(printf '%s' "${signature}" | base64 -w0)"
        curl -s -H 'Content-Type: application/json' -X POST -d "{\"workspace\":\"mooncake_test\",\"project\":\"mooncake-ci\",\"template\":\"mooncake-ci-test\",\"name\":\"mooncake-ci-${{ github.sha }}\",\"username\":\"${tone_user_name}\",\"env_ifs\":\" \",\"env_info\":\"ARTIFACT_ID=${artifact_id} GIT_REPO=${{ github.repository }}\",\"signature\":\"$signature\"}" https://tone.openanolis.cn/api/job/create/ > job.json
        if [ "$(jq .code job.json)" == 200 ]; then
          echo "job created"
        else
          echo "job create failed"
          exit 1
        fi
        job_id=$(jq .data.id job.json)
        echo "check job status here and remember to cancel it before restart the job !"
        echo "job_url: https://tone.openanolis.cn/ws/gclfnh19/test_result/${job_id}?tab=4"
        # Hand the job id to the polling step.
        echo "job_id=${job_id}" >> $GITHUB_ENV
      shell: bash
    - name: query job results
      if: ${{ env.tone_user_name != '' }}
      env:
        TONE_USER_TOKEN: ${{ secrets.TONE_USER_TOKEN }}
      run: |
        # Poll every 10 seconds; give up once the counter passes 720.
        time=0
        while true; do
          if [ $time -gt 720 ]; then
            echo "timeout"
            exit 1
          fi
          signature="${tone_user_name}|${TONE_USER_TOKEN}|$(python3 -c "import time;print(time.time())")"
          signature="$(printf '%s' "${signature}" | base64 -w0)"
          curl -s -H 'Content-Type: application/json' -X POST -d "{\"username\":\"${tone_user_name}\", \"signature\":\"$signature\", \"job_id\": \"${job_id}\"}" https://tone.openanolis.cn/api/job/query/ > job_status.json
          if ! [ "$(jq .code job_status.json)" == 200 ]; then
            echo "job query failed"
            exit 1
          fi
          job_status=$(jq .data.job_second_state job_status.json)
          if [[ $job_status =~ "pass" ]]; then
            echo "job successful !"
            exit 0
          elif [[ $job_status =~ "fail" ]] ; then
            echo "job failed or stopped !"
            exit 1
          fi
          time=$(( time + 1))
          sleep 10
        done
      shell: bash
# Builds the project under several cmake flag combinations in sequence, then
# builds the Python wheel and uploads it as the `mooncake-wheel-ubuntu-py<tag>`
# artifact that the `needs: build-flags` jobs download.
build-flags:
  # Run on pushes, on freshly opened PRs, or when the PR carries `run-ci`.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      python-version: ['3.10', '3.12']
  env:
    BUILD_WITH_EP: "1"
    EP_TORCH_VERSIONS: "2.9.0;2.9.1;2.10.0"
    TORCH_CUDA_ARCH_LIST: "8.0;9.0"
    SCCACHE_GHA_ENABLED: "true"
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
        sudo rm -rf /usr/local/lib/android
        df -h
    - name: Install CUDA Toolkit
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        linux-local-args: '["--toolkit"]'
        method: 'network'
        sub-packages: '["nvcc", "nvrtc-dev"]'
        non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'
    - name: Run sccache-cache
      uses: mozilla-actions/sccache-action@v0.0.9
    - name: Configure sccache
      # Re-export the runner's cache service credentials as job environment
      # variables (empty string when unset).
      uses: actions/github-script@v7
      with:
        script: |
          core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
    - name: Run sccache stat for check
      shell: bash
      run: ${SCCACHE_PATH} --show-stats
    - name: Install dependencies
      run: |
        sudo apt update -y
        sudo apt install -y ninja-build
        sudo bash -x dependencies.sh -y
        df -h
      shell: bash
    - name: Build transfer engine only
      # Standalone build from inside mooncake-transfer-engine/ with CUDA off.
      run: |
        cd mooncake-transfer-engine
        mkdir build
        cd build
        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        cmake -G Ninja .. -DUSE_ETCD=OFF -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
        cmake --build .
        sudo cmake --install .
        df -h
      shell: bash
    - name: Configure project with all settings are ON
      run: |
        mkdir build
        cd build
        cmake -G Ninja .. -DUSE_ETCD=ON -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=ON -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
      shell: bash
    # TODO: lack USE_NVMEOF,USE_MNNVL
    - name: Build project with all settings are ON
      run: |
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        cd build
        cmake --build .
        sudo cmake --install .
        df -h
      shell: bash
    - name: Configure project with unit tests and examples
      # Re-configures the existing build/ directory.
      run: |
        cd build
        cmake -G Ninja .. -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON
      shell: bash
    # TODO: lack WITH_RUST_EXAMPLE
    - name: Build project with unit tests and examples
      run: |
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        cd build
        cmake --build .
        sudo cmake --install .
      shell: bash
    - name: Configure project
      # Final configuration: tests/examples off, EP on. The torch version
      # list comes from the job-level EP_TORCH_VERSIONS variable so the two
      # cannot drift apart.
      run: |
        cd build
        rm -r */tests
        cmake -G Ninja .. -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF -DUSE_HTTP=ON -DENABLE_SCCACHE=ON -DUSE_CXL=ON -DWITH_EP=ON -DEP_TORCH_VERSIONS="${EP_TORCH_VERSIONS}"
      shell: bash
    - name: Build project
      run: |
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        cd build
        cmake --build .
        sudo cmake --install .
      shell: bash
    - name: Build nvlink_allocator.so
      run: |
        mkdir -p build/mooncake-transfer-engine/nvlink-allocator
        cd mooncake-transfer-engine/nvlink-allocator
        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/
      shell: bash
    - name: Generate Python version tag
      id: generate_tag_flags
      # Strips the dot from the matrix version, e.g. 3.10 -> 310.
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    - name: Build Python wheel
      run: |
        # Build wheel with specific Python version
        PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }} ./scripts/build_wheel.sh
      shell: bash
    - name: Upload Python wheel artifact
      uses: actions/upload-artifact@v4
      with:
        name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}
        path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
        # Fail here, not later in the downloading jobs, if no wheel was built.
        if-no-files-found: error
# Checks that the repository's Dockerfile still builds; the image is tagged
# locally as `mooncake-app` and not pushed anywhere.
build-docker:
  name: Build Docker Image
  # Run on pushes, on freshly opened PRs, or when the PR carries `run-ci`.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  steps:
    - uses: actions/checkout@v4
    - name: Set up Docker Buildx
      # v3 replaces v2, which targets the deprecated Node 16 action runtime.
      uses: docker/setup-buildx-action@v3
    - name: Build Docker image
      run: docker build -t mooncake-app .
# Runs the crate-ci/typos action over a fresh checkout of the repository.
spell-check:
  name: Spell Check with Typos
  runs-on: ubuntu-22.04
  # Same gate as the other jobs: push, newly opened PR, or `run-ci` label.
  if: >-
    github.event_name == 'push' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4

    - name: Spell Check Repo
      uses: crate-ci/typos@v1.30.2
# Runs scripts/code_format.sh in --check mode against a base ref derived
# from the triggering event.
clang-format:
  name: Check code format
  # Unlike the other jobs, this gate also admits manual dispatch.
  if: >-
    github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0  # Need full history for branch comparison
        persist-credentials: false
    - name: Install clang-format 20
      run: |
        wget https://apt.llvm.org/llvm.sh
        chmod +x llvm.sh
        sudo ./llvm.sh 20
        sudo apt-get install -y clang-format-20
    - name: Check code format
      env:
        # Context values are passed through the environment instead of
        # being expanded into the script text.
        EVENT_NAME: ${{ github.event_name }}
        PR_BASE_BRANCH: ${{ github.base_ref }}
        BEFORE_SHA: ${{ github.event.before }}
        DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
      run: |
        # Check script exists and is executable
        if [[ ! -x ./scripts/code_format.sh ]]; then
          echo "Error: code_format.sh not found or not executable"
          exit 1
        fi
        # Determine base ref for comparison
        if [ "${EVENT_NAME}" == "pull_request" ]; then
          # For PRs: compare against the target branch
          BASE_REF="origin/${PR_BASE_BRANCH}"
        elif [ -z "${BEFORE_SHA}" ] \
          || [ "${BEFORE_SHA}" == "0000000000000000000000000000000000000000" ] \
          || ! git cat-file -e "${BEFORE_SHA}^{commit}" 2>/dev/null; then
          # No usable "before" commit: events without one (empty value), a
          # new branch push (all-zero SHA), or a commit that is not present
          # in this clone. Compare against the default branch instead.
          BASE_REF="origin/${DEFAULT_BRANCH}"
        else
          # Normal push (single or multiple commits): use the pre-push commit
          BASE_REF="${BEFORE_SHA}"
        fi
        echo "Comparing against: ${BASE_REF}"
        ./scripts/code_format.sh --check --base "${BASE_REF}"
      shell: bash