Skip to content

CI Test on ASCEND Platform #717

CI Test on ASCEND Platform

CI Test on ASCEND Platform #717

Workflow file for this run

# Workflow header: runs after the "Build & Test (Linux)" workflow completes and
# publishes the Ascend test outcome as a commit status.
name: 'CI Test on ASCEND Platform'

on:
  workflow_run:
    # Must match the `name:` of the upstream workflow exactly.
    workflows: ["Build & Test (Linux)"]
    types:
      - completed

# Declaring any scope here sets every scope that is not listed to `none`, so
# the read access needed by the checkout steps is spelled out explicitly
# instead of relying on the repository being publicly readable.
permissions:
  contents: read   # actions/checkout in check-paths and build-and-test
  statuses: write  # createCommitStatus in report-status
jobs:
  # Gate job: publishes `should-run`, which the Ascend job reads to decide
  # whether the triggering commit touched any path listed in the filter below.
  check-paths:
    runs-on: ubuntu-latest
    outputs:
      should-run: ${{ steps.filter.outputs.changed }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Full history of the commit that triggered the upstream run;
          # presumably required so the filter can diff against the base branch.
          fetch-depth: 0
          ref: ${{ github.event.workflow_run.head_sha }}

      - id: filter
        name: Check changed files
        uses: dorny/paths-filter@v3
        with:
          # Compare against the pull request's base branch when the upstream
          # run came from a pull request; otherwise fall back to `main`.
          base: >-
            ${{ github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.pull_requests[0].base.ref || 'main' }}
          ref: ${{ github.event.workflow_run.head_sha }}
          filters: |
            changed:
              - 'mooncake-*/**'
              - 'extern/**'
              - 'CMakeLists.txt'
              - 'scripts/**'
              - '.github/workflows/**'
# ---------------------------------------------------------------------------
# Job: build-and-test. Builds Mooncake with USE_ASCEND_DIRECT and runs the
# Hixl samples inside a privileged container on the self-hosted runner.
# ---------------------------------------------------------------------------
  build-and-test:
    needs: check-paths
    # Runs only when the upstream workflow succeeded, this is the canonical
    # repository (not a fork), and check-paths reported relevant changes.
    if: >-
      ${{ github.event.workflow_run.conclusion == 'success'
      && github.repository == 'kvcache-ai/Mooncake'
      && needs.check-paths.outputs.should-run == 'true' }}
    runs-on: self-hosted
    container:
      image: localhost:5000/mooncake-hixl-ci:v5
      # Folded into a single option string: root user, privileged mode, the
      # eight davinci devices plus the manager/svm/hdc device nodes, and a
      # raised process limit.
      options: >-
        --privileged --user 0:0
        --device /dev/davinci0 --device /dev/davinci1
        --device /dev/davinci2 --device /dev/davinci3
        --device /dev/davinci4 --device /dev/davinci5
        --device /dev/davinci6 --device /dev/davinci7
        --device /dev/davinci_manager --device /dev/devmm_svm
        --device /dev/hisi_hdc --ulimit nproc=65535:65535
      env:
        GITHUB_ACTIONS: 'true'
        # The trailing ':' is part of the value and is kept as-is.
        LD_PRELOAD: '/usr/lib64/libjemalloc.so.2:'
      # Host paths mounted at the same location inside the container.
      volumes:
        - /usr/local/dcmi:/usr/local/dcmi
        - /usr/local/Ascend/driver/:/usr/local/Ascend/driver/
        - /etc/ascend_install.info:/etc/ascend_install.info
        - /etc/hccn.conf:/etc/hccn.conf
    steps:
# Step (build-and-test): check out the commit that triggered the upstream run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # For `workflow_run` events the default checkout is the tip of the
          # default branch, not the commit the upstream workflow built. Pin the
          # same SHA that check-paths inspects and that report-status stamps,
          # so the reported status describes the code that was actually tested.
          # NOTE(review): this executes the triggering commit on a privileged
          # self-hosted runner; confirm that is acceptable for every kind of
          # upstream run that can reach this job.
          ref: ${{ github.event.workflow_run.head_sha }}
          fetch-depth: 1
          persist-credentials: false
# Step (build-and-test): fetch submodules (with a local pybind11 fallback),
# install the Ascend dependencies and configure a fresh CMake build tree.
      - name: Configure CMake
        shell: bash
        run: |
          source /usr/local/Ascend/cann-9.0.0/set_env.sh
          pwd

          # A failed submodule update is tolerated when extern/pybind11 is
          # already populated, or when a sibling ../pybind11 can be copied in.
          pybind_dir="extern/pybind11"
          fallback_dir="../pybind11"
          if ! git submodule update --init --recursive; then
            if [[ -d "$pybind_dir" && -n "$(ls -A "$pybind_dir" 2>/dev/null)" ]]; then
              echo "Detected that extern/pybind11 already exists, continuing execution...."
            else
              echo "git submodule update failed, try to cp pybind11..."
              if [[ ! -d "$fallback_dir" ]]; then
                echo "Error: ../pybind11 does not exist. Cannot copy pybind11."
                exit 1
              fi
              cp -r "$fallback_dir" extern/
            fi
          fi

          bash scripts/ascend/dependencies_ascend_installation.sh

          echo "Configuring CMake..."
          cmake_flags=(
            -DUSE_ASCEND_DIRECT=ON
            -DBUILD_EXAMPLES=OFF
            -DBUILD_UNIT_TESTS=OFF
          )
          # Start from an empty build directory on every run.
          rm -rf build
          mkdir -p build
          cd build
          cmake .. "${cmake_flags[@]}"
# Step (build-and-test): compile the configured tree and install the result.
      - name: Build
        shell: bash
        run: |
          source /usr/local/Ascend/cann-9.0.0/set_env.sh
          echo "Building..."
          cd build
          # One build job per processor reported by nproc.
          cmake --build . -j"$(nproc)"
          cmake --install .
          echo "Mooncake installed successfully."
# Step (build-and-test): clone Hixl, start a local mooncake_master, then run
# each sample as two ranks under two HCCL scenarios and summarise the results.
      - name: Run Hixl Mooncake Store Test
        shell: bash
        run: |
          source /usr/local/Ascend/cann-9.0.0/set_env.sh
          set -e
          export ASCEND_PROCESS_LOG_PATH=/tmp/hixl-test-log/
          export ASCEND_GLOBAL_LOG_LEVEL=3

          # Stop the background master on every exit path, including early
          # exits caused by `set -e`.
          MASTER_PID=""
          cleanup() {
            if [ -n "$MASTER_PID" ]; then
              kill "$MASTER_PID" 2>/dev/null || true
              wait "$MASTER_PID" 2>/dev/null || true
              MASTER_PID=""
            fi
          }
          trap cleanup EXIT

          echo "=== Cloning Hixl repository ==="
          cd ..
          rm -rf hixl
          git clone https://gitcode.com/cann/hixl.git
          cd hixl/examples/third_parties/mooncake_store/python/
          export LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}

          echo "=== Starting Mooncake Master ==="
          # Look for the installed binary first, then fall back to the build
          # tree. `|| true` keeps a failing find (for example a missing
          # directory) from aborting the script before the explicit
          # "not found" error below can be printed.
          MOONCAKE_MASTER=$(find /usr/local/bin /usr/bin -name "mooncake_master" -type f 2>/dev/null | head -1 || true)
          if [ -z "$MOONCAKE_MASTER" ]; then
            MOONCAKE_MASTER=$(find "$GITHUB_WORKSPACE/build" -name "mooncake_master" -type f 2>/dev/null | head -1 || true)
          fi
          if [ -z "$MOONCAKE_MASTER" ]; then
            echo "Error: mooncake_master binary not found"
            exit 1
          fi
          echo "Found mooncake_master at: $MOONCAKE_MASTER"

          # Start Mooncake master in the background with its HTTP metadata
          # server enabled; all output goes to /tmp/mooncake_master.log.
          "$MOONCAKE_MASTER" \
            --enable_http_metadata_server=true \
            --http_metadata_server_host=0.0.0.0 \
            --http_metadata_server_port=8080 \
            > /tmp/mooncake_master.log 2>&1 &
          MASTER_PID=$!
          echo "Mooncake Master started with PID: $MASTER_PID"

          # Fixed grace period, then confirm the process is still alive.
          echo "Waiting for Mooncake Master to initialize..."
          sleep 5
          if ! kill -0 "$MASTER_PID" 2>/dev/null; then
            echo "Error: Mooncake Master failed to start"
            cat /tmp/mooncake_master.log
            exit 1
          fi
          echo "Mooncake Master is running"

          echo "=== Running Hixl Mooncake Store Tests ==="
          # Sample scripts to run, relative to the current directory.
          TEST_CASES=(
            "batch_put_get_sample.py"
            "batch_put_get_multi_buffers_sample.py"
          )
          # Scenario labels; each one selects an environment setup below.
          TEST_SCENARIOS=(
            "HCCL_INTRA_ROCE_ENABLE=1"
            "HCCL_INTRA_ROCE_ENABLE_UNSET"
          )
          # Result bookkeeping, entries are "<scenario>:<test_case>".
          FAILED_TESTS=()
          PASSED_TESTS=()

          for scenario in "${TEST_SCENARIOS[@]}"; do
            echo ""
            echo "========================================="
            echo "Running scenario: $scenario"
            echo "========================================="
            # Configure environment variables for the current scenario.
            if [ "$scenario" = "HCCL_INTRA_ROCE_ENABLE=1" ]; then
              export HCCL_INTRA_ROCE_ENABLE=1
              unset ASCEND_BUFFER_POOL
              echo "HCCL_INTRA_ROCE_ENABLE is set to 1, ASCEND_BUFFER_POOL is unset"
            else
              unset HCCL_INTRA_ROCE_ENABLE
              export ASCEND_BUFFER_POOL=4:8
              echo "HCCL_INTRA_ROCE_ENABLE is not set, ASCEND_BUFFER_POOL is set to 4:8"
            fi

            for test_case in "${TEST_CASES[@]}"; do
              echo ""
              echo "-----------------------------------------"
              echo "Test: $test_case"
              echo "-----------------------------------------"
              if [ ! -f "$test_case" ]; then
                echo "Warning: Test file $test_case not found, skipping..."
                continue
              fi

              # Per-rank log files; '=' is stripped from the scenario label.
              log_prefix="/tmp/hixl_test_${scenario//=/}_${test_case%.py}"

              # Each rank runs in its own subshell with pipefail enabled, so
              # the status collected by `wait` is python's exit status and
              # not tee's.
              # Rank 0 runs with --device_id=0.
              (
                set -o pipefail
                python3 "$test_case" \
                  --device_id=0 \
                  --rank=0 \
                  --world_size=2 \
                  --distributed \
                  2>&1 | tee "${log_prefix}_rank0.log"
              ) &
              PID0=$!
              # Rank 1 runs with --device_id=2.
              # NOTE(review): the previous comment said "device 1" while the
              # flag is 2; the flag is kept unchanged, confirm which is meant.
              (
                set -o pipefail
                python3 "$test_case" \
                  --device_id=2 \
                  --rank=1 \
                  --world_size=2 \
                  --distributed \
                  2>&1 | tee "${log_prefix}_rank1.log"
              ) &
              PID1=$!

              # Collect both exit codes without tripping `set -e`: a bare
              # failing `wait` would abort the script here, before the result
              # is recorded, the summary is printed or the master is stopped.
              TEST_RESULT0=0
              wait "$PID0" || TEST_RESULT0=$?
              TEST_RESULT1=0
              wait "$PID1" || TEST_RESULT1=$?

              if [ "$TEST_RESULT0" -eq 0 ] && [ "$TEST_RESULT1" -eq 0 ]; then
                echo "✓ $test_case PASSED (scenario: $scenario)"
                PASSED_TESTS+=("$scenario:$test_case")
              else
                echo "✗ $test_case FAILED (scenario: $scenario)"
                if [ "$TEST_RESULT0" -ne 0 ]; then
                  echo "  Rank 0 failed with code: $TEST_RESULT0"
                fi
                if [ "$TEST_RESULT1" -ne 0 ]; then
                  echo "  Rank 1 failed with code: $TEST_RESULT1"
                fi
                FAILED_TESTS+=("$scenario:$test_case")
              fi
            done
          done

          echo ""
          echo "========================================="
          echo "Test Summary"
          echo "========================================="
          echo "Passed tests: ${#PASSED_TESTS[@]}"
          for test in "${PASSED_TESTS[@]}"; do
            echo "  ✓ $test"
          done
          echo ""
          echo "Failed tests: ${#FAILED_TESTS[@]}"
          for test in "${FAILED_TESTS[@]}"; do
            echo "  ✗ $test"
          done

          # Stop the master now; the EXIT trap then has nothing left to do.
          echo ""
          echo "Stopping Mooncake Master..."
          cleanup

          # Fail the step if any scenario/test combination failed.
          if [ "${#FAILED_TESTS[@]}" -gt 0 ]; then
            echo ""
            echo "Some tests failed!"
            exit 1
          fi
          echo ""
          echo "All Hixl Mooncake Store tests completed successfully!"
# Step (build-and-test): completion marker, printed whatever the outcome of
# the earlier steps.
      - name: Test Summary
        if: ${{ always() }}
        shell: bash
        run: echo "CI Test completed"
# Step (build-and-test): publish every log the test step writes, whatever the
# outcome of the earlier steps.
      - name: Upload Test Logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-logs-${{ github.run_number }}
          # Besides the ASCEND_PROCESS_LOG_PATH directory, the test step tees
          # per-rank output to /tmp/hixl_test_*.log and redirects the master's
          # output to /tmp/mooncake_master.log; without these two entries
          # those files never reach the artifact.
          # NOTE(review): listing several paths presumably roots the artifact
          # at their common parent (/tmp), which changes the layout inside the
          # archive; confirm nothing depends on the old layout.
          path: |
            /tmp/hixl-test-log/*
            /tmp/hixl_test_*.log
            /tmp/mooncake_master.log
          retention-days: 30
          if-no-files-found: warn
# ---------------------------------------------------------------------------
# Job: report-status. Always runs and publishes one commit status on the
# upstream run's head commit summarising what happened to the Ascend test.
# ---------------------------------------------------------------------------
  report-status:
    needs: [check-paths, build-and-test]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - uses: actions/github-script@v7
        with:
          script: |
            // Inputs: upstream workflow conclusion plus the results/outputs of
            // the two jobs this one depends on.
            const conclusion = context.payload.workflow_run.conclusion;
            const pathsResult = '${{ needs.check-paths.result }}';
            const shouldRun = '${{ needs.check-paths.outputs.should-run }}';
            const testResult = '${{ needs.build-and-test.result }}';
            const isTargetRepo = '${{ github.repository }}' === 'kvcache-ai/Mooncake';

            // Decision ladder, first match wins.
            let state, description;
            if (conclusion === 'cancelled') {
              state = 'failure';
              description = 'CI cancelled (format/spell check failed)';
            } else if (conclusion !== 'success') {
              state = 'failure';
              description = 'CI ' + conclusion + ', Ascend test skipped';
            } else if (!isTargetRepo) {
              state = 'success';
              description = 'Skipped (fork repository)';
            } else if (pathsResult !== 'success') {
              // When check-paths itself fails or is cancelled, `should-run`
              // is empty. Without this branch that case would be reported as
              // a successful "no relevant file changes" skip even though the
              // Ascend test never had a chance to run.
              state = 'error';
              description = `Path check ${pathsResult}, Ascend test not run`;
            } else if (shouldRun !== 'true') {
              state = 'success';
              description = 'Skipped (no relevant file changes)';
            } else if (testResult === 'success') {
              state = 'success';
              description = 'Ascend test passed';
            } else if (testResult === 'skipped') {
              state = 'success';
              description = 'Skipped';
            } else {
              state = 'failure';
              description = `Ascend test ${testResult}`;
            }

            // Attach the status to the commit the upstream run was built
            // from and link back to this workflow run.
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
              repo: context.repo.repo,
              sha: context.payload.workflow_run.head_sha,
              state: state,
              context: 'CI Test on ASCEND / build-and-test',
              description: description,
              target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`
            });