[transfer_engine] fix: drain endpoint waiting list via periodic reclaim #6140
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Linux build-and-test workflow.
name: 'Build & Test (Linux)'

# Triggers: pushes and pull requests that target main, plus manual dispatch.
# The "labeled" PR activity type is included so that adding a label
# re-triggers the workflow.
on:
  push:
    branches:
      - "main"
  pull_request:
    branches:
      - "main"
    types:
      - opened
      - synchronize
      - reopened
      - labeled
  workflow_dispatch: {}

# The workflow token only gets read access to repository contents.
permissions:
  contents: read

# One active run per workflow + ref + PR number (or commit SHA when there is
# no pull request); a newer run in the same group cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true
| jobs: | |
build:
  # Debug build with ASan and --coverage flags, followed by the Go binding
  # tests, the CTest suite, a coverage report, and a Python wheel build.
  # Runs once per Python version in the matrix.
  needs: [spell-check, clang-format, check-paths]
  # Gate 1: relevant paths changed (or manual dispatch).
  # Gate 2: push, manual dispatch, freshly opened PR, or PR labelled 'run-ci'.
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      python-version: ['3.10', '3.12']
  env:
    CI: "true"
    SCCACHE_GHA_ENABLED: "true"
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    # etcd is started in the background and left running for later steps.
    - name: Install and start etcd
      run: |
        wget https://github.com/etcd-io/etcd/releases/download/v3.6.1/etcd-v3.6.1-linux-amd64.tar.gz
        tar xzf etcd-v3.6.1-linux-amd64.tar.gz
        sudo mv etcd-v3.6.1-linux-amd64/etcd* /usr/local/bin/
        etcd --advertise-client-urls http://127.0.0.1:2379 --listen-client-urls http://127.0.0.1:2379 &
        sleep 3  # Give etcd time to start
        etcdctl --endpoints=http://127.0.0.1:2379 endpoint health
      shell: bash
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
    - name: Install CUDA Toolkit
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        linux-local-args: '["--toolkit"]'
        method: 'network'
        sub-packages: '["nvcc"]'
    - name: Install coverage tools and build utilities
      run: |
        sudo apt-get update
        sudo apt-get install -y lcov gcovr ninja-build
    # Exported through GITHUB_ENV so every later step compiles and links
    # with --coverage.
    - name: Set up coverage compilation flags
      run: |
        echo "Setting up coverage compilation flags..."
        echo "CXXFLAGS=--coverage" >> $GITHUB_ENV
        echo "CFLAGS=--coverage" >> $GITHUB_ENV
        echo "LDFLAGS=--coverage" >> $GITHUB_ENV
      shell: bash
    - name: Run sccache-cache
      uses: mozilla-actions/sccache-action@v0.0.9
    # Re-export the runner's cache service URL and token as job environment
    # variables so later shell steps can see them.
    - name: Configure sccache
      uses: actions/github-script@v7
      with:
        script: |
          core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
    - name: Run sccache stat for check
      shell: bash
      run: ${SCCACHE_PATH} --show-stats
    - name: Configure project with coverage support
      run: |
        sudo apt update -y
        sudo bash -x dependencies.sh -y
        mkdir build
        cd build
        cmake -G Ninja .. -DUSE_HTTP=ON -DUSE_CXL=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DENABLE_ASAN=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Debug
      shell: bash
    - name: Build project
      run: |
        cd build
        cmake --build .
        sudo cmake --install .
      shell: bash
    - name: Build nvlink_allocator.so
      run: |
        mkdir -p build/mooncake-transfer-engine/nvlink-allocator
        cd mooncake-transfer-engine/nvlink-allocator
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/
      shell: bash
    # The HTTP metadata server is started in the background; later steps
    # reach it at http://127.0.0.1:8080/metadata.
    - name: Start Metadata Server
      run: |
        cd mooncake-transfer-engine/example/http-metadata-server-python
        pip install aiohttp
        python ./bootstrap_server.py &
      shell: bash
    # Starts a dedicated mooncake_master for the Go tests and stops it
    # afterwards. The Go binding links against the ASan/coverage build, hence
    # -lasan/-lgcov and the ASAN_OPTIONS overrides.
    - name: Run Go store binding integration tests
      run: |
        $GITHUB_WORKSPACE/build/mooncake-store/src/mooncake_master \
        --eviction_high_watermark_ratio=0.95 \
        --cluster_id=ci_go_test_cluster \
        --port 50051 &
        MASTER_PID=$!
        sleep 3
        cd mooncake-store/go
        export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/build/mooncake-asio:$GITHUB_WORKSPACE/build/mooncake-store/src:$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src:$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src/common/base:$GITHUB_WORKSPACE/build/mooncake-common/etcd
        export CGO_ENABLED=1
        export CGO_CFLAGS="-I$GITHUB_WORKSPACE/mooncake-store/include -I$GITHUB_WORKSPACE/mooncake-transfer-engine/include"
        export CGO_LDFLAGS="-L$GITHUB_WORKSPACE/build/mooncake-store/src -L$GITHUB_WORKSPACE/build/mooncake-store/src/cachelib_memory_allocator -L$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src -L$GITHUB_WORKSPACE/build/mooncake-transfer-engine/src/common/base -L$GITHUB_WORKSPACE/build/mooncake-asio -L$GITHUB_WORKSPACE/build/mooncake-common/etcd -lmooncake_store -lcachelib_memory_allocator -ltransfer_engine -lbase -lasio -letcd_wrapper -lstdc++ -lnuma -lglog -lgflags -libverbs -ljsoncpp -lzstd -lcurl -luring -lasan -lm -lgcov"
        ASAN_OPTIONS=detect_leaks=0:verify_asan_link_order=0 MC_METADATA_SERVER=http://127.0.0.1:8080/metadata go test -v ./tests/...
        kill $MASTER_PID 2>/dev/null || true
      shell: bash
    - name: Test (in build env) with coverage
      run: |
        cd build
        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
        ldconfig -v || echo "always continue"
        # Pass an explicit job count: a bare `-j` is only documented from
        # CTest 3.29 onwards, and older releases may parse the following
        # argument as the count, silently dropping --output-on-failure.
        MC_METADATA_SERVER=http://127.0.0.1:8080/metadata DEFAULT_KV_LEASE_TTL=500 ctest -j "$(nproc)" --output-on-failure
      shell: bash
    - name: Drain HTTP E2E test
      if: matrix.python-version == '3.12'
      run: |
        cd build
        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
        # Keep the sanitizer gate on the C++ integration test. The Python
        # drain script is manual/nightly only because pybind + ASan teardown in
        # a Python host process is not stable.
        DEFAULT_KV_LEASE_TTL=500 ./mooncake-store/tests/task_integration_test --gtest_filter='TaskExecutorIntegrationTest.DrainJobCompleteFlow'
      shell: bash
    # Coverage is best-effort: if lcov cannot capture data, a placeholder
    # report is written, coverage_failed=true is set as a step output, and
    # the step still succeeds.
    - name: Generate coverage report
      id: coverage
      run: |
        cd build
        echo "=== Starting coverage report generation ==="
        echo "Current directory: $(pwd)"
        echo "=== Looking for .gcda files ==="
        find . -name "*.gcda" 2>/dev/null | head -10 || echo "No .gcda files found"
        echo "=== Running lcov ==="
        lcov --capture --directory . --output-file coverage.info 2>&1 || {
          echo "WARNING: lcov failed to capture coverage data"
          echo "Creating minimal lcov-compliant coverage file to allow CI to continue"
          echo "TN:dummy" > coverage.filtered.info
          echo "SF:/dev/null" >> coverage.filtered.info
          echo "DA:0,0" >> coverage.filtered.info
          echo "end_of_record" >> coverage.filtered.info
          echo "coverage_failed=true" >> $GITHUB_OUTPUT
          exit 0  # Exit successfully, do not block CI
        }
        echo "=== Processing coverage data ==="
        lcov --remove coverage.info '/usr/*' '*/test/*' '*/third_party/*' --output-file coverage.filtered.info 2>&1 || true
        echo "=== Generating HTML report ==="
        genhtml coverage.filtered.info --output-directory coverage_report 2>&1 || echo "genhtml failed, continuing..."
        echo "=== Coverage summary ==="
        lcov --list coverage.filtered.info 2>&1 || echo "lcov list failed"
        echo "=== Coverage report generation completed ==="
      shell: bash
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        files: build/coverage.filtered.info
        flags: unittests
        name: code-coverage-report
        token: ${{ secrets.CODECOV_TOKEN }}
        fail_ci_if_error: false
      continue-on-error: true
    # Surfaces a failed coverage capture as a warning annotation without
    # failing the job.
    - name: Check coverage status
      if: always()
      run: |
        if [ "${{ steps.coverage.outputs.coverage_failed }}" = "true" ]; then
          echo "⚠️ Coverage collection failed but CI continued"
          echo "::warning::Code coverage collection failed. Please check the build logs."
        else
          echo "✅ Coverage collected successfully"
        fi
    # Turns e.g. "3.12" into "312" for use in directory and artifact names.
    - name: Generate Python version tag
      id: generate_tag_build
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    # In CI, build_wheel.sh removes build/ to free disk (CI=true); set FREE_BUILD_DIR=1 locally to enable.
    - name: Build Python wheel
      run: |
        PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_build.outputs.python_version_tag }} ./scripts/build_wheel.sh
      shell: bash
    - name: Upload wheel for ZMQ test job
      uses: actions/upload-artifact@v4
      with:
        name: wheel-build-py${{ steps.generate_tag_build.outputs.python_version_tag }}
        path: mooncake-wheel/dist-py${{ steps.generate_tag_build.outputs.python_version_tag }}/*.whl
build-musa:
  # Compile-only check of the MUSA configuration inside the vendor container;
  # unit tests and examples are switched off in the CMake configure step.
  needs:
    - spell-check
    - clang-format
    - check-paths
  # Same two-part gate as the other build jobs: relevant paths changed (or
  # manual dispatch) AND the event is a push, a dispatch, a newly opened PR,
  # or a PR carrying the 'run-ci' label.
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  runs-on: ubuntu-22.04
  container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false

    # Register the checkout as a git safe.directory inside the container.
    - name: Mark repository as safe
      shell: bash
      run: git config --global --add safe.directory $GITHUB_WORKSPACE

    # No sudo here: commands run directly inside the container.
    - name: Configure project
      shell: bash
      run: |
        apt update -y
        apt install -y ninja-build
        bash -x dependencies.sh -y
        mkdir build
        cd build
        cmake -G Ninja .. -DUSE_MUSA=ON -DUSE_MNNVL=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DUSE_CXL=ON -DUSE_TCP=ON -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF

    # ~/.bashrc is sourced before building.
    # NOTE(review): presumably to pick up environment written by
    # dependencies.sh - confirm.
    - name: Build project
      shell: bash
      run: |
        cd build
        source ~/.bashrc
        cmake --build .
        cmake --install .
test-wheel-ubuntu:
  # Installs the wheel produced by build-flags on each Ubuntu/Python
  # combination and runs the Python-level test scripts against a locally
  # started metadata server and Mooncake master.
  needs: [spell-check, clang-format, build-flags]
  if: >-
    needs.build-flags.result == 'success' &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  strategy:
    matrix:
      ubuntu-version: [ubuntu-22.04, ubuntu-24.04]
      python-version: ['3.10', '3.12']
  runs-on: ${{ matrix.ubuntu-version }}
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    # Turns e.g. "3.12" into "312" so the artifact name matches the one
    # uploaded by build-flags.
    - name: Generate Python version tag
      id: generate_tag_test
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
      shell: bash
    - name: Download wheel artifact
      uses: actions/download-artifact@v4
      with:
        name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_test.outputs.python_version_tag }}
        path: mooncake-wheel/dist
    - name: Verify wheel file exists
      run: |
        ls -la mooncake-wheel/dist/
        # Count glob matches via an array: `[ -f dir/*.whl ]` errors out
        # with "too many arguments" as soon as more than one wheel matches.
        shopt -s nullglob
        wheels=(mooncake-wheel/dist/*.whl)
        if [ "${#wheels[@]}" -eq 0 ]; then
          echo "ERROR: No wheel file found in mooncake-wheel/dist/"
          exit 1
        fi
      shell: bash
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
        sudo rm -rf /usr/local/lib/android
        df -h
    - name: Install CUDA Toolkit
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        linux-local-args: '["--toolkit"]'
        method: 'network'
    # Later steps activate test_env/.
    # NOTE(review): presumably this script creates that virtualenv and
    # installs the wheel into it - confirm.
    - name: Run installation test script
      run: |
        bash scripts/test_installation.sh
      shell: bash
    # Started in the background; later steps reach it on port 8080.
    - name: Start metadata server
      run: |
        source test_env/bin/activate
        mooncake_http_metadata_server --port 8080 &
      shell: bash
    - name: Run tests with ssd
      run: |
        source test_env/bin/activate
        MC_STORE_MEMCPY=false TEST_SSD_OFFLOAD_IN_EVICT=true ./scripts/run_tests.sh
        rm -rf /tmp/mooncake_test_ssd
        deactivate
      shell: bash
    # Started in the background on port 50051 for the API tests below.
    - name: Start Mooncake Master
      run: |
        source test_env/bin/activate
        mkdir -p /tmp/mooncake_storage
        mooncake_master \
        --eviction_high_watermark_ratio=0.95 \
        --cluster_id=ci_test_cluster \
        --port 50051 &
        sleep 3
      shell: bash
    - name: Run Python Tensor API Performance Test (CI check)
      env:
        MOONCAKE_MASTER: "127.0.0.1:50051"
        MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
        MOONCAKE_PROTOCOL: "tcp"
        LOCAL_HOSTNAME: "127.0.0.1"
      run: |
        source test_env/bin/activate
        python scripts/test_tensor_api.py -n 1
      shell: bash
    - name: Run Python Async API Test (CI check)
      env:
        MOONCAKE_MASTER: "127.0.0.1:50051"
        MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
        MOONCAKE_PROTOCOL: "tcp"
        LOCAL_HOSTNAME: "127.0.0.1"
      run: |
        source test_env/bin/activate
        python scripts/test_async_store.py
      shell: bash
    - name: Test Mooncake Copy/Move API
      env:
        MOONCAKE_MASTER: "127.0.0.1:50051"
        MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
        MOONCAKE_PROTOCOL: "tcp"
        LOCAL_HOSTNAME: "127.0.0.1"
      run: |
        source test_env/bin/activate
        python scripts/test_copy_move_api.py
      shell: bash
    - name: Run Python Drain HTTP E2E Test (CI check)
      env:
        MOONCAKE_MASTER: "127.0.0.1:50051"
        MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata"
        MOONCAKE_PROTOCOL: "tcp"
        LOCAL_HOSTNAME: "127.0.0.1"
      run: |
        source test_env/bin/activate
        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
        python scripts/test_drain_http_api.py --timeout-sec 90
      shell: bash
    # Smoke test only: the client is capped at 10 seconds and its exit
    # status is deliberately ignored; the server is always stopped afterwards.
    - name: Run RPC Communicator Bandwidth Test
      run: |
        source test_env/bin/activate
        python mooncake-transfer-engine/tests/rpc_communicator_test.py server --url 127.0.0.1:9004 --data-size 1 &
        SERVER_PID=$!
        sleep 5
        timeout 10 python mooncake-transfer-engine/tests/rpc_communicator_test.py client --url 127.0.0.1:9004 --threads 2 --data-size 1 || true
        kill $SERVER_PID 2>/dev/null || true
        wait $SERVER_PID 2>/dev/null || true
      shell: bash
    - name: Test Mooncake PyTorch Backend (CPU Only)
      env:
        MC_FORCE_TCP: "true"
      run: |
        source test_env/bin/activate
        python -m unittest mooncake-wheel.tests.test_mooncake_backend_cpu
      shell: bash
    - name: Test Safetensor Functions
      run: |
        source test_env/bin/activate
        pip install safetensors
        python -m unittest mooncake-wheel.tests.test_safetensor_functions
      shell: bash
build-flags:
  # Builds the project under several CMake flag combinations in sequence
  # (transfer engine alone, everything ON, Rust bindings, EP wheel build)
  # and uploads the resulting Python wheel, once per Python version.
  needs:
    - spell-check
    - clang-format
    - check-paths
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      python-version:
        - '3.10'
        - '3.12'
  env:
    CI: "true"
    BUILD_WITH_EP: "1"
    TORCH_CUDA_ARCH_LIST: "8.0;9.0"
    SCCACHE_GHA_ENABLED: "true"
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    # Remove large preinstalled toolchains, then print remaining space.
    - name: Free up disk space
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
        sudo rm -rf /usr/local/lib/android
        df -h

    - name: Install CUDA Toolkit
      uses: Jimver/cuda-toolkit@v0.2.24
      with:
        cuda: '12.8.1'
        method: 'network'
        linux-local-args: '["--toolkit"]'
        sub-packages: '["nvcc", "nvrtc-dev"]'
        non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'

    - name: Run sccache-cache
      uses: mozilla-actions/sccache-action@v0.0.9

    # Re-export the runner's cache service URL and token as job environment
    # variables so later shell steps can see them.
    - name: Configure sccache
      uses: actions/github-script@v7
      with:
        script: |
          core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');

    - name: Run sccache stat for check
      shell: bash
      run: ${SCCACHE_PATH} --show-stats

    - name: Install dependencies
      shell: bash
      run: |
        sudo apt update -y
        sudo apt install -y ninja-build
        sudo bash -x dependencies.sh -y
        df -h

    - name: Install Rust toolchain
      uses: dtolnay/rust-toolchain@stable

    # Stand-alone build of mooncake-transfer-engine in its own build
    # directory, with CUDA and etcd switched off.
    - name: Build transfer engine only
      shell: bash
      run: |
        cd mooncake-transfer-engine
        mkdir build
        cd build
        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        cmake -G Ninja .. \
          -DUSE_ETCD=OFF -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON \
          -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON \
          -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF \
          -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
        cmake --build .
        sudo cmake --install .
        df -h

    # Top-level build tree with the listed features enabled (USE_MNNVL stays
    # OFF; see the TODO below).
    - name: Configure project with all settings are ON
      shell: bash
      run: |
        mkdir build
        cd build
        cmake -G Ninja .. \
          -DUSE_ETCD=ON -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON \
          -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_METRICS=ON \
          -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON \
          -DUSE_CUDA=ON -DUSE_MNNVL=OFF \
          -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"

    # TODO: lack USE_NVMEOF,USE_MNNVL
    - name: Build project with all settings are ON
      shell: bash
      run: |
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        cd build
        cmake --build .
        sudo cmake --install .
        df -h

    # Re-configure the same build tree, additionally enabling WITH_STORE_RUST.
    - name: Configure project with unit tests and examples
      shell: bash
      run: |
        cd build
        cmake -G Ninja .. -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DWITH_STORE_RUST=ON -DENABLE_SCCACHE=ON

    - name: Build project with unit tests and examples
      shell: bash
      run: |
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        cd build
        cmake --build .
        sudo cmake --install .

    # Type-check (cargo check, not cargo test) the Rust bindings against the
    # store library and headers built above.
    - name: Check Mooncake Store Rust bindings and example
      shell: bash
      run: |
        cd mooncake-store/rust
        MOONCAKE_STORE_LIB_DIR=$GITHUB_WORKSPACE/build/mooncake-store/src \
        MOONCAKE_STORE_INCLUDE_DIR=$GITHUB_WORKSPACE/mooncake-store/include \
        cargo check --example basic_usage --tests

    # Final configuration used for the wheel: tests and examples are turned
    # off and their build output directories are removed first.
    - name: Configure project
      shell: bash
      run: |
        cd build
        rm -r */tests
        cmake -G Ninja .. \
          -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF -DUSE_HTTP=ON \
          -DENABLE_SCCACHE=ON -DUSE_CXL=ON -DWITH_EP=ON \
          -DEP_TORCH_VERSIONS="2.9.0;2.9.1;2.10.0;2.11.0"

    - name: Build project
      shell: bash
      run: |
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        cd build
        cmake --build .
        sudo cmake --install .

    - name: Build nvlink_allocator.so
      shell: bash
      run: |
        mkdir -p build/mooncake-transfer-engine/nvlink-allocator
        cd mooncake-transfer-engine/nvlink-allocator
        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
        export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH
        bash build.sh ../../build/mooncake-transfer-engine/nvlink-allocator/

    # Turns e.g. "3.12" into "312" for use in directory and artifact names.
    - name: Generate Python version tag
      id: generate_tag_flags
      shell: bash
      run: |
        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT

    # In CI, build_wheel.sh removes build/ to free disk (CI=true); set FREE_BUILD_DIR=1 locally to enable.
    - name: Build Python wheel
      shell: bash
      run: |
        PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }} ./scripts/build_wheel.sh

    # Downloaded by the test-wheel-ubuntu job under the same artifact name.
    - name: Upload Python wheel artifact
      uses: actions/upload-artifact@v4
      with:
        name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}
        path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
build-docker:
  # Verifies that the from-source Docker image still builds. The image is
  # only tagged locally; nothing is pushed.
  name: Build Docker Image
  needs: [spell-check, clang-format, check-paths]
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
  runs-on: ubuntu-22.04
  steps:
    - uses: actions/checkout@v4
      with:
        persist-credentials: false
    # v3 replaces v2, which runs on the deprecated Node 16 action runtime.
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Build Docker image
      run: |
        docker build -f docker/mooncake.Dockerfile \
        --build-arg PYTHON_VERSION=3.10 \
        --build-arg EP_TORCH_VERSIONS="2.9.1" \
        -t mooncake:from-source .
spell-check:
  # Runs the typos checker over the checked-out repository.
  name: Spell Check with Typos
  runs-on: ubuntu-22.04
  # Runs on push, manual dispatch, a newly opened PR, or a PR labelled 'run-ci'.
  if: >-
    github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4
      with:
        persist-credentials: false

    - name: Spell Check Repo
      uses: crate-ci/typos@v1.30.2
clang-format:
  # Runs scripts/code_format.sh --check against a base revision chosen from
  # the triggering event.
  name: Check code format
  if: >-
    github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout Actions Repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0  # Need full history for branch comparison
        persist-credentials: false
    - name: Install clang-format 20
      run: |
        wget https://apt.llvm.org/llvm.sh
        chmod +x llvm.sh
        sudo ./llvm.sh 20
        sudo apt-get install -y clang-format-20
    - name: Check code format
      # Event data is handed to the script through environment variables
      # rather than being expanded into the script text, so ref names can
      # never be interpreted as shell code.
      env:
        EVENT_NAME: ${{ github.event_name }}
        PR_BASE_REF: ${{ github.base_ref }}
        BEFORE_SHA: ${{ github.event.before }}
        DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
      run: |
        # Check script exists and is executable
        if [[ ! -x ./scripts/code_format.sh ]]; then
          echo "Error: code_format.sh not found or not executable"
          exit 1
        fi
        # Determine base ref for comparison
        if [ "${EVENT_NAME}" == "pull_request" ]; then
          # For PRs: compare against the target branch
          BASE_REF="origin/${PR_BASE_REF}"
        elif [ -z "${BEFORE_SHA}" ] || [ "${BEFORE_SHA}" == "0000000000000000000000000000000000000000" ]; then
          # New branch push (all-zero SHA) or an event without a "before"
          # commit such as workflow_dispatch: compare against default branch
          BASE_REF="origin/${DEFAULT_BRANCH}"
        else
          # Normal push (single or multiple commits): use the pre-push SHA
          BASE_REF="${BEFORE_SHA}"
        fi
        echo "Comparing against: ${BASE_REF}"
        ./scripts/code_format.sh --check --base "${BASE_REF}"
      shell: bash
check-paths:
  # Publishes the 'should-run-downstream' output that the build jobs use to
  # decide whether anything relevant changed.
  runs-on: ubuntu-latest
  if: >-
    github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci')
  outputs:
    # The manual-dispatch override wins when set; otherwise use the filter result.
    should-run-downstream: ${{ steps.dispatch-override.outputs.src || steps.filter.outputs.src }}
  steps:
    # workflow_dispatch has no PR/push diff context — skip paths-filter and default to true
    - id: dispatch-override
      name: Default to true for workflow_dispatch
      if: github.event_name == 'workflow_dispatch'
      run: echo "src=true" >> $GITHUB_OUTPUT

    - if: github.event_name != 'workflow_dispatch'
      uses: actions/checkout@v4
      with:
        persist-credentials: false
        fetch-depth: 2

    # 'src' is the single filter; it lists the paths that should trigger
    # the downstream jobs.
    - id: filter
      if: github.event_name != 'workflow_dispatch'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          src:
            - 'mooncake-*/**'
            - 'extern/**'
            - 'CMakeLists.txt'
            - 'dependencies.sh'
            - 'scripts/**'
            - '.github/workflows/**'
build-wheel-cu13:
  # Delegates to the reusable workflow ci_cu13.yml, passing all of this
  # workflow's secrets through to it.
  uses: ./.github/workflows/ci_cu13.yml
  secrets: inherit
  needs:
    - spell-check
    - clang-format
    - check-paths
  # Same two-part gate as the other build jobs.
  if: >-
    (needs.check-paths.outputs.should-run-downstream == 'true' ||
    github.event_name == 'workflow_dispatch') &&
    (github.event_name == 'push' ||
    github.event_name == 'workflow_dispatch' ||
    github.event.action == 'opened' ||
    contains(github.event.pull_request.labels.*.name, 'run-ci'))
ci-gate:
  # Single aggregate status: fails if any job listed in `needs` ended in a
  # state other than "success" or "skipped". `if: always()` makes the gate
  # run even when upstream jobs fail or are cancelled.
  name: CI Gate
  if: always()
  needs:
    - spell-check
    - clang-format
    # check-paths must be listed explicitly: when it fails, every job that
    # depends on it is skipped, and "skipped" is accepted below, so without
    # this entry a broken path filter would let the gate pass on a run
    # that built nothing.
    - check-paths
    - build
    - build-musa
    - build-flags
    - build-docker
    - test-wheel-ubuntu
    - build-wheel-cu13
  runs-on: ubuntu-latest
  steps:
    - name: Check required job results
      run: |
        failing=$(echo "$NEEDS_JSON" | jq -r '
        to_entries[] |
        select(.value.result != "success" and .value.result != "skipped") |
        "\(.key): \(.value.result)"')
        if [ -n "$failing" ]; then
          echo "::error::The following jobs failed or were cancelled:"
          echo "$failing"
          exit 1
        fi
        echo "All checks passed or were acceptably skipped."
      env:
        # JSON object keyed by job id, each entry carrying that job's result.
        NEEDS_JSON: ${{ toJSON(needs) }}