Skip to content

[Store] Implement tenant metadata map isolation (#2232) #7213

[Store] Implement tenant metadata map isolation (#2232)

[Store] Implement tenant metadata map isolation (#2232) #7213

Workflow file for this run

# Linux build-and-test pipeline. It reacts to pushes and pull requests that
# target main and can also be started by hand.
name: "Build & Test (Linux)"
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
    # 'labeled' is listed so that adding a label to a PR triggers a new run.
    types:
      - opened
      - synchronize
      - reopened
      - labeled
  workflow_dispatch: {}
# The workflow token is limited to reading repository contents.
permissions:
  contents: read
# Runs that resolve to the same group key replace one another.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true
jobs:
# Main job: builds the project with coverage flags and runs the test suites,
# once per Python version listed in the matrix.
build:
needs: [spell-check, clang-format, check-paths]
# Gate: check-paths must report relevant changes (a manual dispatch bypasses
# that check), and the trigger must be a push, a manual dispatch, a newly
# opened pull request, or a pull request carrying the 'run-ci' label.
if: >-
(needs.check-paths.outputs.should-run-downstream == 'true' ||
github.event_name == 'workflow_dispatch') &&
(github.event_name == 'push' ||
github.event_name == 'workflow_dispatch' ||
github.event.action == 'opened' ||
contains(github.event.pull_request.labels.*.name, 'run-ci'))
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: ['3.10', '3.12']
env:
CI: "true"
SCCACHE_GHA_ENABLED: "true"
steps:
# Check out the sources without persisting the token in the local git config.
- uses: actions/checkout@v4
with:
persist-credentials: false
# Select the interpreter for the current matrix entry.
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
# Stable Rust toolchain; cargo is invoked by a later step of this job.
- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable
# Downloads a pinned etcd release, starts it in the background on
# 127.0.0.1:2379 and waits until it reports healthy.
- name: Install and start etcd
  run: |
    wget https://github.com/etcd-io/etcd/releases/download/v3.6.1/etcd-v3.6.1-linux-amd64.tar.gz
    tar xzf etcd-v3.6.1-linux-amd64.tar.gz
    sudo mv etcd-v3.6.1-linux-amd64/etcd* /usr/local/bin/
    etcd --advertise-client-urls http://127.0.0.1:2379 --listen-client-urls http://127.0.0.1:2379 &
    # Poll for readiness instead of sleeping a fixed time: a slow runner may
    # need more than a few seconds, and a fast one should not wait needlessly.
    etcd_ready=false
    for attempt in $(seq 1 30); do
      if etcdctl --endpoints=http://127.0.0.1:2379 endpoint health; then
        etcd_ready=true
        break
      fi
      sleep 1
    done
    if [ "${etcd_ready}" != "true" ]; then
      echo "::error::etcd did not become healthy within 30 seconds"
      exit 1
    fi
  shell: bash
# Delete several preinstalled SDK directories to reclaim disk space.
- name: Free up disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
# Network install of CUDA 12.8.1, restricted to the nvcc sub-package.
- name: Install CUDA Toolkit
uses: Jimver/cuda-toolkit@v0.2.24
with:
cuda: '12.8.1'
linux-local-args: '["--toolkit"]'
method: 'network'
sub-packages: '["nvcc"]'
# lcov and gcovr for coverage reporting, ninja for the CMake generator.
- name: Install coverage tools and build utilities
run: |
sudo apt-get update
sudo apt-get install -y lcov gcovr ninja-build
# Runs a standalone Python test script before anything is compiled.
- name: Test HugeTLB sizing helper
run: |
python3 scripts/test_hicache_hugepage_requirements.py
shell: bash
# Written to GITHUB_ENV so that every later step sees --coverage in the
# compiler and linker flag variables.
- name: Set up coverage compilation flags
run: |
echo "Setting up coverage compilation flags..."
echo "CXXFLAGS=--coverage" >> $GITHUB_ENV
echo "CFLAGS=--coverage" >> $GITHUB_ENV
echo "LDFLAGS=--coverage" >> $GITHUB_ENV
shell: bash
- name: Run sccache-cache
uses: mozilla-actions/sccache-action@v0.0.9
# Re-exports the Actions cache endpoint and runtime token as environment
# variables for later steps (presumably for sccache's GitHub Actions
# backend, which SCCACHE_GHA_ENABLED turns on; confirm against sccache docs).
- name: Configure sccache
uses: actions/github-script@v7
with:
script: |
core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
# Prints cache statistics as a sanity check that sccache is usable.
- name: Run sccache stat for check
shell: bash
run: ${SCCACHE_PATH} --show-stats
# Installs project dependencies, then generates a Ninja Debug build tree
# with ASan, sccache and the listed transport/store options enabled.
- name: Configure project with coverage support
run: |
sudo apt update -y
sudo bash -x dependencies.sh -y
mkdir build
cd build
cmake -G Ninja .. -DUSE_HTTP=ON -DUSE_CXL=ON -DUSE_UB=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DENABLE_ASAN=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Debug
shell: bash
# Compile, then install with root privileges.
- name: Build project
run: |
cd build
cmake --build .
sudo cmake --install .
shell: bash
# Builds the allocator library into the build tree; LIBRARY_PATH is extended
# with the CUDA stub directory for the link step.
- name: Build nvlink_allocator.so
run: |
mkdir -p build/mooncake-transfer-engine/nvlink-allocator
cd mooncake-transfer-engine/nvlink-allocator
export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/
shell: bash
# Launches the example Python metadata server in the background. Later steps
# point MC_METADATA_SERVER at 127.0.0.1:8080, presumably this server.
- name: Start Metadata Server
run: |
cd mooncake-transfer-engine/example/http-metadata-server-python
pip install aiohttp
python ./bootstrap_server.py &
shell: bash
# Starts a temporary mooncake_master on port 50051, runs the Rust smoke test
# and a small benchmark against it, then shuts the master down.
- name: Run Mooncake Store Rust smoke test and benchmark
  run: |
    $GITHUB_WORKSPACE/build/mooncake-store/src/mooncake_master \
      --eviction_high_watermark_ratio=0.95 \
      --cluster_id=ci_rust_test_cluster \
      --port 50051 &
    MASTER_PID=$!
    sleep 3
    cd mooncake-store/rust
    export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/build/mooncake-asio:$GITHUB_WORKSPACE/build/mooncake-store/src:$GITHUB_WORKSPACE/build/mooncake-store/src/cachelib_memory_allocator:$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src:$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src/common/base:$GITHUB_WORKSPACE/build/mooncake-common/etcd:$LD_LIBRARY_PATH
    export MOONCAKE_BUILD_DIR=$GITHUB_WORKSPACE/build
    export MOONCAKE_STORE_LIB_DIR=$GITHUB_WORKSPACE/build/mooncake-store/src
    export MOONCAKE_STORE_INCLUDE_DIR=$GITHUB_WORKSPACE/mooncake-store/include
    export MC_METADATA_SERVER=http://127.0.0.1:8080/metadata
    export MC_RUST_STORE_RUN_INTEGRATION=true
    export MC_RUST_STORE_MASTER_ADDR=127.0.0.1:50051
    export MC_RUST_STORE_LOCAL_HOSTNAME=127.0.0.1
    export MC_RUST_STORE_PROTOCOL=tcp
    export MC_RUST_STORE_DEVICE_NAME=
    cargo test --test minimal_smoke -- --nocapture
    MC_RUST_BENCH_ITERATIONS=4 \
    MC_RUST_BENCH_VALUE_SIZE=4096 \
    MC_RUST_BENCH_WARMUP=1 \
    cargo run --release --example store_benchmark
    kill $MASTER_PID 2>/dev/null || true
    # Reap the master before the step ends so that port 50051 is released
    # before another master is started on it. `|| true` absorbs the
    # non-zero status that wait reports for a signalled process.
    wait $MASTER_PID 2>/dev/null || true
  shell: bash
# Starts a temporary mooncake_master on port 50051, runs the Go binding
# tests through cgo against the freshly built libraries, then shuts the
# master down.
- name: Run Go store binding integration tests
  run: |
    $GITHUB_WORKSPACE/build/mooncake-store/src/mooncake_master \
      --eviction_high_watermark_ratio=0.95 \
      --cluster_id=ci_go_test_cluster \
      --port 50051 &
    MASTER_PID=$!
    sleep 3
    cd mooncake-store/go
    export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/build/mooncake-common:$GITHUB_WORKSPACE/build/mooncake-store/src:$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src:$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src/common/base:$GITHUB_WORKSPACE/build/mooncake-common/etcd
    export CGO_ENABLED=1
    export CGO_CFLAGS="-I$GITHUB_WORKSPACE/mooncake-store/include -I$GITHUB_WORKSPACE/mooncake-transfer-engine/include"
    export CGO_LDFLAGS="-L$GITHUB_WORKSPACE/build/mooncake-store/src -L$GITHUB_WORKSPACE/build/mooncake-store/src/cachelib_memory_allocator -L$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src -L$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src/common/base -L$GITHUB_WORKSPACE/build/mooncake-common -L$GITHUB_WORKSPACE/build/mooncake-common/etcd -lmooncake_store -lcachelib_memory_allocator -ltransfer_engine -lbase -lasio -letcd_wrapper -lstdc++ -lnuma -lglog -lgflags -libverbs -ljsoncpp -lzstd -lcurl -luring -lasan -lm -lgcov"
    # Link cudart if CUDA is available (needed for D2H staging in mooncake_store)
    if [ -d /usr/local/cuda/lib64 ]; then export CGO_LDFLAGS="$CGO_LDFLAGS -L/usr/local/cuda/lib64 -lcudart"; fi
    ASAN_OPTIONS=detect_leaks=0:verify_asan_link_order=0 MC_METADATA_SERVER=http://127.0.0.1:8080/metadata go test -v ./tests/...
    kill $MASTER_PID 2>/dev/null || true
    # Reap the master so that port 50051 is actually free when the step
    # ends. `|| true` absorbs the non-zero status of a signalled process.
    wait $MASTER_PID 2>/dev/null || true
  shell: bash
# Runs the CTest suite in parallel against the metadata server, excluding
# ub_transport_test.
- name: Test (in build env) with coverage
  run: |
    cd build
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
    ldconfig -v || echo "always continue"
    # Pass an explicit parallel level: CTest releases before 3.29 expect a
    # value after -j and would otherwise consume --output-on-failure as
    # that value.
    MC_METADATA_SERVER=http://127.0.0.1:8080/metadata DEFAULT_KV_LEASE_TTL=500 ctest -j "$(nproc)" --output-on-failure -E ub_transport_test
  shell: bash
# Runs a single gtest case from task_integration_test directly; only executed
# for the Python 3.12 matrix entry.
- name: Drain HTTP E2E test
if: matrix.python-version == '3.12'
run: |
cd build
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
# Keep the sanitizer gate on the C++ integration test. The Python
# drain script is manual/nightly only because pybind + ASan teardown in
# a Python host process is not stable.
DEFAULT_KV_LEASE_TTL=500 ./mooncake-store/tests/task_integration_test --gtest_filter='TaskExecutorIntegrationTest.DrainJobCompleteFlow'
shell: bash
# Collects gcov data with lcov, filters out system, test and third-party
# paths, and renders an HTML report. Coverage problems never fail the job:
# if the capture fails, a placeholder tracefile is written and the step
# output `coverage_failed=true` is set.
- name: Generate coverage report
  id: coverage
  run: |
    cd build
    echo "=== Starting coverage report generation ==="
    echo "Current directory: $(pwd)"
    echo "=== Looking for .gcda files ==="
    # sed reads the whole stream, so find is never cut off by a closed pipe
    # (which, under pipefail, `head` would turn into a spurious failure),
    # and the emptiness test reflects whether anything was actually found.
    gcda_sample=$(find . -name "*.gcda" 2>/dev/null | sed -n '1,10p' || true)
    if [ -n "${gcda_sample}" ]; then
      echo "${gcda_sample}"
    else
      echo "No .gcda files found"
    fi
    echo "=== Running lcov ==="
    lcov --capture --directory . --output-file coverage.info 2>&1 || {
      echo "WARNING: lcov failed to capture coverage data"
      echo "Creating minimal lcov-compliant coverage file to allow CI to continue"
      echo "TN:dummy" > coverage.filtered.info
      echo "SF:/dev/null" >> coverage.filtered.info
      echo "DA:0,0" >> coverage.filtered.info
      echo "end_of_record" >> coverage.filtered.info
      echo "coverage_failed=true" >> $GITHUB_OUTPUT
      exit 0  # Exit successfully, do not block CI
    }
    echo "=== Processing coverage data ==="
    lcov --remove coverage.info '/usr/*' '*/test/*' '*/third_party/*' --output-file coverage.filtered.info 2>&1 || true
    echo "=== Generating HTML report ==="
    genhtml coverage.filtered.info --output-directory coverage_report 2>&1 || echo "genhtml failed, continuing..."
    echo "=== Coverage summary ==="
    lcov --list coverage.filtered.info 2>&1 || echo "lcov list failed"
    echo "=== Coverage report generation completed ==="
  shell: bash
# Best-effort upload: neither an upload error nor a failure of this step
# fails the job.
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
files: build/coverage.filtered.info
flags: unittests
name: code-coverage-report
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: false
continue-on-error: true
# Always runs; turns the coverage_failed output of the 'coverage' step into
# a workflow warning annotation.
- name: Check coverage status
if: always()
run: |
if [ "${{ steps.coverage.outputs.coverage_failed }}" = "true" ]; then
echo "⚠️ Coverage collection failed but CI continued"
echo "::warning::Code coverage collection failed. Please check the build logs."
else
echo "✅ Coverage collected successfully"
fi
# Strips the dot from the matrix Python version (e.g. 3.10 -> 310) and
# exposes the result as a step output used in directory and artifact names.
- name: Generate Python version tag
id: generate_tag_build
run: |
echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
shell: bash
# In CI, build_wheel.sh removes build/ to free disk (CI=true); set FREE_BUILD_DIR=1 locally to enable.
- name: Build Python wheel
run: |
PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_build.outputs.python_version_tag }} ./scripts/build_wheel.sh
shell: bash
# Publishes the wheel as a per-Python-version artifact.
- name: Upload wheel for ZMQ test job
uses: actions/upload-artifact@v4
with:
name: wheel-build-py${{ steps.generate_tag_build.outputs.python_version_tag }}
path: mooncake-wheel/dist-py${{ steps.generate_tag_build.outputs.python_version_tag }}/*.whl
# Compile-only check with USE_MUSA enabled, executed inside the MUSA devel
# container with unit tests and examples switched off.
build-musa:
  needs:
    - spell-check
    - clang-format
    - check-paths
  # Same gate as the other build jobs: relevant paths changed (or manual
  # dispatch), and the event is a push, dispatch, opened PR or 'run-ci' PR.
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  runs-on: ubuntu-22.04
  container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false
    # Registers the workspace as a git safe.directory for the container user.
    - name: Mark repository as safe
      shell: bash
      run: git config --global --add safe.directory $GITHUB_WORKSPACE
    # No sudo in this job: the commands run directly inside the container.
    - name: Configure project
      shell: bash
      run: |
        apt update -y
        apt install -y ninja-build
        bash -x dependencies.sh -y
        mkdir build
        cd build
        cmake -G Ninja .. -DUSE_MUSA=ON -DUSE_MNNVL=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DUSE_CXL=ON -DUSE_TCP=ON -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF
    - name: Build project
      shell: bash
      run: |
        cd build
        source ~/.bashrc
        cmake --build .
        cmake --install .
# Downloads the wheel artifact for the matching Python version and runs the
# Python-level tests on every Ubuntu/Python combination in the matrix.
test-wheel-ubuntu:
needs: [spell-check, clang-format, build-flags]
# Requires build-flags to have succeeded, plus the usual event gate
# (push, manual dispatch, newly opened PR, or PR labelled 'run-ci').
if: >-
needs.build-flags.result == 'success' &&
(github.event_name == 'push' ||
github.event_name == 'workflow_dispatch' ||
github.event.action == 'opened' ||
contains(github.event.pull_request.labels.*.name, 'run-ci'))
strategy:
matrix:
ubuntu-version: [ubuntu-22.04, ubuntu-24.04]
python-version: ['3.10', '3.12']
runs-on: ${{ matrix.ubuntu-version }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
# Strips the dot from the matrix Python version (e.g. 3.10 -> 310); the tag
# selects the artifact to download below.
- name: Generate Python version tag
id: generate_tag_test
run: |
echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
shell: bash
# The artifact name follows the mooncake-wheel-ubuntu-py<tag> pattern.
- name: Download wheel artifact
uses: actions/download-artifact@v4
with:
name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_test.outputs.python_version_tag }}
path: mooncake-wheel/dist
# Fails fast when the downloaded artifact contains no wheel.
- name: Verify wheel file exists
  run: |
    ls -la mooncake-wheel/dist/
    # compgen -G succeeds when the pattern matches at least one path. The
    # usual `[ -f dir/*.whl ]` form breaks when the glob expands to several
    # files: the test errors out and the check is silently skipped.
    if ! compgen -G "mooncake-wheel/dist/*.whl" > /dev/null; then
      echo "ERROR: No wheel file found in mooncake-wheel/dist/"
      exit 1
    fi
  shell: bash
# Delete several preinstalled SDK directories, then print the remaining
# disk space.
- name: Free up disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /usr/local/lib/android
df -h
# Network install of CUDA 12.8.1 with no sub-package restriction.
- name: Install CUDA Toolkit
uses: Jimver/cuda-toolkit@v0.2.24
with:
cuda: '12.8.1'
linux-local-args: '["--toolkit"]'
method: 'network'
# The steps below activate test_env/bin/activate; presumably that virtual
# environment is created by this script (confirm in test_installation.sh).
- name: Run installation test script
run: |
bash scripts/test_installation.sh
shell: bash
# Background HTTP metadata server on port 8080, used by the tests below.
- name: Start metadata server
run: |
source test_env/bin/activate
mooncake_http_metadata_server --port 8080 &
shell: bash
# Runs the test script with SSD offload on eviction enabled, then removes
# the /tmp/mooncake_test_ssd directory.
- name: Run tests with ssd
run: |
# Reserve port 50052 (mooncake_client RPC port) so the kernel never
# auto-allocates it as ephemeral source port for other outbound
# connections in the test suite. Without this, a random Python test
# connection can pick src_port=50052, leave a TIME_WAIT on
# <eth0_ip>:50052 for 60s, and block mooncake_client's bind to
# 0.0.0.0:50052 even with SO_REUSEADDR (Linux only relaxes
# TIME_WAIT+bind conflict for same-IP or loopback).
sudo sysctl -w net.ipv4.ip_local_reserved_ports=50052
source test_env/bin/activate
MC_STORE_MEMCPY=false TEST_SSD_OFFLOAD_IN_EVICT=true ./scripts/run_tests.sh
rm -rf /tmp/mooncake_test_ssd
deactivate
shell: bash
# Background master on port 50051 that the following API tests connect to
# through MOONCAKE_MASTER.
- name: Start Mooncake Master
run: |
source test_env/bin/activate
mkdir -p /tmp/mooncake_storage
mooncake_master \
--eviction_high_watermark_ratio=0.95 \
--cluster_id=ci_test_cluster \
--port 50051 &
sleep 3
shell: bash
# The next four steps share one environment: master at 127.0.0.1:50051,
# HTTP metadata server at 127.0.0.1:8080, TCP protocol.
- name: Run Python Tensor API Performance Test (CI check)
env:
MOONCAKE_MASTER: "127.0.0.1:50051"
MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
MOONCAKE_PROTOCOL: "tcp"
LOCAL_HOSTNAME: "127.0.0.1"
run: |
source test_env/bin/activate
python scripts/test_tensor_api.py -n 1
shell: bash
- name: Run Python Async API Test (CI check)
env:
MOONCAKE_MASTER: "127.0.0.1:50051"
MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
MOONCAKE_PROTOCOL: "tcp"
LOCAL_HOSTNAME: "127.0.0.1"
run: |
source test_env/bin/activate
python scripts/test_async_store.py
shell: bash
- name: Test Mooncake Copy/Move API
env:
MOONCAKE_MASTER: "127.0.0.1:50051"
MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
MOONCAKE_PROTOCOL: "tcp"
LOCAL_HOSTNAME: "127.0.0.1"
run: |
source test_env/bin/activate
python scripts/test_copy_move_api.py
shell: bash
# Python drain script, run with a 90 second timeout argument.
- name: Run Python Drain HTTP E2E Test (CI check)
env:
MOONCAKE_MASTER: "127.0.0.1:50051"
MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
MOONCAKE_PROTOCOL: "tcp"
LOCAL_HOSTNAME: "127.0.0.1"
run: |
source test_env/bin/activate
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
python scripts/test_drain_http_api.py --timeout-sec 90
shell: bash
# Starts a server in the background, runs the client for at most 10 seconds,
# then kills and reaps the server. The client's exit status is ignored
# (`|| true`), so a client failure does not fail this step.
- name: Run RPC Communicator Bandwidth Test
run: |
source test_env/bin/activate
python mooncake-transfer-engine/tests/rpc_communicator_test.py server --url 127.0.0.1:9004 --data-size 1 &
SERVER_PID=$!
sleep 5
timeout 10 python mooncake-transfer-engine/tests/rpc_communicator_test.py client --url 127.0.0.1:9004 --threads 2 --data-size 1 || true
kill $SERVER_PID 2>/dev/null || true
wait $SERVER_PID 2>/dev/null || true
# MC_FORCE_TCP is set for this step only.
- name: Test Mooncake PyTorch Backend (CPU Only)
env:
MC_FORCE_TCP: "true"
run: |
source test_env/bin/activate
python mooncake-pg/tests/test_pg_collectives.py
shell: bash
- name: Test Safetensor Functions
run: |
source test_env/bin/activate
pip install safetensors
python -m unittest mooncake-wheel.tests.test_safetensor_functions
shell: bash
# Builds the project several times with different CMake option sets, then
# produces and uploads the mooncake-wheel-ubuntu-py<tag> wheel artifact.
build-flags:
needs: [spell-check, clang-format, check-paths]
# Gate: relevant paths changed (or manual dispatch), and the event is a push,
# a manual dispatch, a newly opened PR, or a PR labelled 'run-ci'.
if: >-
(needs.check-paths.outputs.should-run-downstream == 'true' ||
github.event_name == 'workflow_dispatch') &&
(github.event_name == 'push' ||
github.event_name == 'workflow_dispatch' ||
github.event.action == 'opened' ||
contains(github.event.pull_request.labels.*.name, 'run-ci'))
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: ['3.10', '3.12']
env:
CI: "true"
BUILD_WITH_EP: "1"
TORCH_CUDA_ARCH_LIST: "8.0;9.0"
SCCACHE_GHA_ENABLED: "true"
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
# Delete several preinstalled SDK directories, then print the remaining
# disk space.
- name: Free up disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /usr/local/lib/android
df -h
# Network install of CUDA 12.8.1: nvcc and nvrtc-dev plus the cuSPARSE,
# cuBLAS and cuSOLVER development packages.
- name: Install CUDA Toolkit
uses: Jimver/cuda-toolkit@v0.2.24
with:
cuda: '12.8.1'
linux-local-args: '["--toolkit"]'
method: 'network'
sub-packages: '["nvcc", "nvrtc-dev"]'
non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'
- name: Run sccache-cache
uses: mozilla-actions/sccache-action@v0.0.9
# Re-exports the Actions cache endpoint and runtime token as environment
# variables for later steps (presumably for sccache's GitHub Actions
# backend; confirm against sccache docs).
- name: Configure sccache
uses: actions/github-script@v7
with:
script: |
core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
- name: Run sccache stat for check
shell: bash
run: ${SCCACHE_PATH} --show-stats
- name: Install dependencies
run: |
sudo apt update -y
sudo apt install -y ninja-build
sudo bash -x dependencies.sh -y
df -h
shell: bash
- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable
# Standalone build of the mooncake-transfer-engine subdirectory in its own
# build tree, with etcd, CUDA, MNNVL and UB switched off.
- name: Build transfer engine only
run: |
cd mooncake-transfer-engine
mkdir build
cd build
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
cmake -G Ninja .. -DUSE_ETCD=OFF -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DUSE_UB=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
cmake --build .
sudo cmake --install .
df -h
shell: bash
# Top-level build tree with most feature options enabled, including CUDA;
# MNNVL and UB stay off.
- name: Configure project with all settings are ON
run: |
mkdir build
cd build
cmake -G Ninja .. -DUSE_ETCD=ON -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=ON -DUSE_MNNVL=OFF -DUSE_UB=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
shell: bash
# TODO: USE_NVMEOF and USE_MNNVL are not covered by this configuration yet.
- name: Build project with all settings are ON
run: |
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
cd build
cmake --build .
sudo cmake --install .
df -h
shell: bash
# Reconfigures the existing build tree, additionally passing
# -DWITH_STORE_RUST=ON.
- name: Configure project with unit tests and examples
run: |
cd build
cmake -G Ninja .. -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DWITH_STORE_RUST=ON -DENABLE_SCCACHE=ON
shell: bash
- name: Build project with unit tests and examples
run: |
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
cd build
cmake --build .
sudo cmake --install .
shell: bash
# Runs the Rust library unit tests and compiles (--no-run) the examples and
# integration tests without executing them.
- name: Check Mooncake Store Rust bindings, examples, and tests
run: |
# libcuda.so.1 (SONAME of the CUDA stub) must be findable at runtime.
# The toolkit stubs dir only ships libcuda.so; create the versioned symlink.
if [ -f /usr/local/cuda/lib64/stubs/libcuda.so ] && \
[ ! -e /usr/local/cuda/lib64/stubs/libcuda.so.1 ]; then
sudo ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
fi
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
cd mooncake-store/rust
export MOONCAKE_BUILD_DIR=$GITHUB_WORKSPACE/build
cargo test --lib
MOONCAKE_STORE_LIB_DIR=$GITHUB_WORKSPACE/build/mooncake-store/src \
MOONCAKE_STORE_INCLUDE_DIR=$GITHUB_WORKSPACE/mooncake-store/include \
cargo test --examples --tests --no-run
shell: bash
# Removes the per-component tests directories from the build tree, then
# reconfigures with unit tests and examples off and WITH_EP on for the listed
# torch versions.
- name: Configure project
run: |
cd build
rm -r */tests
cmake -G Ninja .. -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF -DUSE_HTTP=ON -DENABLE_SCCACHE=ON -DUSE_CXL=ON -DWITH_EP=ON -DEP_TORCH_VERSIONS="2.9.1;2.10.0;2.11.0;2.12.0"
shell: bash
- name: Build project
run: |
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
cd build
cmake --build .
sudo cmake --install .
shell: bash
# Separate build tree (build-tent) configured with USE_TENT=ON.
- name: Configure project with TENT
run: |
mkdir build-tent
cd build-tent
cmake -G Ninja .. -DUSE_TENT=ON -DUSE_HTTP=ON -DENABLE_SCCACHE=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON
shell: bash
- name: Build project with TENT
run: |
cd build-tent
cmake --build .
sudo cmake --install .
shell: bash
# Runs the TENT test directory of the build-tent tree in parallel.
- name: Test (TENT)
  run: |
    cd build-tent
    # Pass an explicit parallel level: CTest releases before 3.29 expect a
    # value after -j and would otherwise consume --output-on-failure as
    # that value.
    ctest --test-dir mooncake-transfer-engine/tent/tests -j "$(nproc)" --output-on-failure
  shell: bash
# Builds the allocator library into the build tree; the CUDA stub directory
# is added to both the runtime and link-time library search paths.
- name: Build nvlink_allocator.so
run: |
mkdir -p build/mooncake-transfer-engine/nvlink-allocator
cd mooncake-transfer-engine/nvlink-allocator
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/
shell: bash
# Strips the dot from the matrix Python version (e.g. 3.10 -> 310) and
# exposes the result as a step output used in directory and artifact names.
- name: Generate Python version tag
id: generate_tag_flags
run: |
echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
shell: bash
# In CI, build_wheel.sh removes build/ to free disk (CI=true); set FREE_BUILD_DIR=1 locally to enable.
- name: Build Python wheel
run: |
PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }} ./scripts/build_wheel.sh
shell: bash
# Publishes the wheel as mooncake-wheel-ubuntu-py<tag>.
- name: Upload Python wheel artifact
uses: actions/upload-artifact@v4
with:
name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}
path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
# Verifies that the from-source Docker image still builds. The image is only
# built locally on the runner; it is not pushed anywhere by this job.
build-docker:
  name: Build Docker Image
  needs: [spell-check, clang-format, check-paths]
  # Gate: relevant paths changed (or manual dispatch), and the event is a
  # push, a manual dispatch, a newly opened PR, or a PR labelled 'run-ci'.
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  runs-on: ubuntu-22.04
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false
    # v3 replaces v2, which targets the deprecated Node 16 action runtime.
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    # Single Python / torch combination to keep the image build short.
    - name: Build Docker image
      run: |
        docker build -f docker/mooncake.Dockerfile \
          --build-arg PYTHON_VERSION=3.10 \
          --build-arg EP_TORCH_VERSIONS="2.9.1" \
          -t mooncake:from-source .
# Repository-wide spell check using the typos action.
spell-check:
  name: Spell Check with Typos
  runs-on: ubuntu-22.04
  # Runs for pushes, manual dispatches, newly opened PRs and PRs labelled
  # 'run-ci'.
  if: >-
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4
      with:
        persist-credentials: false
    - name: Spell Check Repo
      uses: crate-ci/typos@v1.30.2
# Checks that the changes relative to a base revision are clang-format clean.
# The base is the PR target branch for pull requests, the pre-push commit for
# pushes, and the default branch when no usable pre-push commit exists.
clang-format:
  name: Check code format
  if: >-
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0  # Need full history for branch comparison
        persist-credentials: false
    - name: Install clang-format 20
      run: |
        wget https://apt.llvm.org/llvm.sh
        chmod +x llvm.sh
        sudo ./llvm.sh 20
        sudo apt-get install -y clang-format-20
    - name: Check code format
      # Context values reach the script through environment variables rather
      # than being interpolated into the shell source.
      env:
        EVENT_NAME: ${{ github.event_name }}
        PR_BASE_REF: ${{ github.base_ref }}
        BEFORE_SHA: ${{ github.event.before }}
        DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
      run: |
        # Check script exists and is executable
        if [[ ! -x ./scripts/code_format.sh ]]; then
          echo "Error: code_format.sh not found or not executable"
          exit 1
        fi
        # Determine base ref for comparison
        if [ "${EVENT_NAME}" == "pull_request" ]; then
          # For PRs: compare against the target branch
          BASE_REF="origin/${PR_BASE_REF}"
        elif [ -z "${BEFORE_SHA}" ] || \
             [ "${BEFORE_SHA}" == "0000000000000000000000000000000000000000" ]; then
          # No previous commit to diff against: either a new branch push
          # (all-zero SHA) or a workflow_dispatch run, whose payload has no
          # `before` field. Compare against the default branch.
          BASE_REF="origin/${DEFAULT_BRANCH}"
        else
          # Normal push (single or multiple commits): use the pre-push commit
          BASE_REF="${BEFORE_SHA}"
        fi
        echo "Comparing against: ${BASE_REF}"
        ./scripts/code_format.sh --check --base "${BASE_REF}"
      shell: bash
# Strict Sphinx build (warnings are errors). The Python setup, dependency
# install and build steps only execute when the 'docs' path filter matches.
docs-check:
  name: Check Sphinx docs build
  runs-on: ubuntu-22.04
  if: >-
    github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false
        fetch-depth: 2
    # Sets outputs.docs to 'true' when documentation inputs changed.
    - id: filter
      uses: dorny/paths-filter@v3
      with:
        filters: |
          docs:
            - 'docs/**'
            - 'requirements_docs.txt'
    - name: Set up Python
      if: steps.filter.outputs.docs == 'true'
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"
    - name: Install documentation dependencies
      if: steps.filter.outputs.docs == 'true'
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements_docs.txt
    - name: Build docs with strict mode
      if: steps.filter.outputs.docs == 'true'
      shell: bash
      run: |
        cd docs
        make html SPHINXOPTS=-W
# Decides whether the expensive downstream jobs need to run. The job output
# 'should-run-downstream' is 'true' for manual dispatches, and otherwise
# mirrors the result of the 'src' path filter.
check-paths:
  runs-on: ubuntu-latest
  if: >-
    github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  outputs:
    should-run-downstream: ${{ steps.dispatch-override.outputs.src || steps.filter.outputs.src }}
  steps:
    # workflow_dispatch has no PR/push diff context — skip paths-filter and default to true
    - id: dispatch-override
      name: Default to true for workflow_dispatch
      if: github.event_name == 'workflow_dispatch'
      run: echo "src=true" >> $GITHUB_OUTPUT
    - if: github.event_name != 'workflow_dispatch'
      uses: actions/checkout@v4
      with:
        persist-credentials: false
        fetch-depth: 2
    # Source, build-script and workflow paths that warrant a full CI run.
    - id: filter
      if: github.event_name != 'workflow_dispatch'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          src:
            - 'mooncake-*/**'
            - 'extern/**'
            - 'CMakeLists.txt'
            - 'dependencies.sh'
            - 'scripts/**'
            - '.github/workflows/**'
# Delegates to the reusable ci_cu13.yml workflow, passing on all secrets.
build-wheel-cu13:
  needs:
    - spell-check
    - clang-format
    - check-paths
  # Gate: relevant paths changed (or manual dispatch), and the event is a
  # push, a manual dispatch, a newly opened PR, or a PR labelled 'run-ci'.
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  uses: ./.github/workflows/ci_cu13.yml
  secrets: inherit
# Delegates to the reusable ci_ascend.yml workflow once 'build' has finished
# and check-paths reports relevant changes.
ascend-test:
  needs:
    - build
    - check-paths
  if: needs.check-paths.outputs.should-run-downstream == 'true'
  uses: ./.github/workflows/ci_ascend.yml
  secrets: inherit
# Delegates to the reusable integration-test.yml workflow once 'build' has
# finished and check-paths reports relevant changes.
integration-test:
  needs:
    - build
    - check-paths
  if: needs.check-paths.outputs.should-run-downstream == 'true'
  uses: ./.github/workflows/integration-test.yml
  secrets: inherit
# Single required status check. Always runs, inspects the result of every
# job listed in `needs`, and fails unless each one succeeded or was skipped.
ci-gate:
  name: CI Gate
  if: always()
  needs:
    # check-paths must be listed explicitly: when it fails, every job that
    # depends on it is skipped, and "skipped" counts as acceptable below, so
    # without this entry a broken check-paths would produce a green gate.
    - check-paths
    - spell-check
    - clang-format
    - docs-check
    - build
    - build-musa
    - build-flags
    - build-docker
    - test-wheel-ubuntu
    - build-wheel-cu13
    - ascend-test
    - integration-test
  runs-on: ubuntu-latest
  steps:
    - name: Check required job results
      run: |
        # Collect "<job>: <result>" for every job that neither succeeded
        # nor was skipped (i.e. failed or cancelled).
        failing=$(echo "$NEEDS_JSON" | jq -r '
          to_entries[] |
          select(.value.result != "success" and .value.result != "skipped") |
          "\(.key): \(.value.result)"')
        if [ -n "$failing" ]; then
          echo "::error::The following jobs failed or were cancelled:"
          echo "$failing"
          exit 1
        fi
        echo "All checks passed or were acceptably skipped."
      env:
        NEEDS_JSON: ${{ toJSON(needs) }}