# Skip to content

# [metrics] introducing Prometheus metrics endpoint #485

# [metrics] introducing Prometheus metrics endpoint

# [metrics] introducing Prometheus metrics endpoint #485

# Workflow file for this run

# CI workflow: runs the Bazel test targets listed in the job matrix inside the
# project's dev container. When a step fails it prints crash diagnostics and
# uploads logs, core dumps and the crashing executables as an artifact.
name: test-opensrc
# NOTE(review): `actions: write` appears to be required only by the
# delete_old_disk_cache step (`gh cache delete` with the workflow token) — confirm.
permissions:
  contents: read
  actions: write
on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
  workflow_dispatch:
    inputs:
      runs-on:
        description: "Runner type"
        type: choice
        options:
          - ubuntu-24.04-arm
          - ubuntu-latest
        default: ubuntu-latest
      rebuildDiskCache:
        description: "Rebuild disk cache"
        type: boolean
        default: false
# One concurrency group per workflow + pull request (or ref) + runner type.
# Only pull_request runs cancel an in-progress run of the same group.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ inputs.runs-on || 'ubuntu-latest' }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
  test:
    strategy:
      # Let every matrix entry finish even if another one fails.
      fail-fast: false
      matrix:
        include:
          - name: unit_test
            test_target: "//kv_cache_manager/..."
            test_args: ""
          - name: integration_test
            test_target: "//integration_test/..."
            test_args: ""
          - name: client_test
            test_target: "//kv_cache_manager/client/..."
            test_args: "--config=client"
    name: ${{ matrix.name }}
    # `inputs` is only populated for workflow_dispatch; push and pull_request
    # runs fall back to ubuntu-latest.
    runs-on: ${{ inputs.runs-on || 'ubuntu-latest' }}
    container:
      image: ghcr.io/alibaba/tair-kvcache-kvcm-dev:2026_02_13_12_03_24230b1
      # The host root is mounted so the cleanup step can delete host directories.
      volumes:
        - /:/host_root/
      # NOTE(review): privileged mode is presumably what allows setup_coredump to
      # write /proc/sys/kernel/core_pattern — confirm.
      options: --privileged
    steps:
      # https://github.com/actions/runner-images/issues/2840
      # https://github.com/korjavin/haskel-web1/commit/3bd3d2a74c41418ec31a8bec34dcd9c100f4e22d
      - name: Free up disk space
        run: |
          echo "Disk space before cleanup:"
          df -h
          rm -rf /host_root/usr/share/dotnet
          rm -rf /host_root/usr/local/lib/android
          rm -rf /host_root/opt/ghc
          echo "Disk space after cleanup:"
          df -h
      - uses: actions/checkout@v4
      - uses: bazel-contrib/setup-bazel@0.18.0
        with:
          bazelisk-cache: true
          disk-cache: ${{ runner.os }}-${{ github.workflow }}-${{ matrix.name }}
          repository-cache: true
          # Caches are saved only when a rebuild was explicitly requested.
          cache-save: ${{ inputs.rebuildDiskCache || false }}
      # NOTE(review): assumes setup-bazel keeps the disk cache in
      # ~/.cache/bazel-disk — confirm against the action's documentation.
      - name: clean_disk_cache
        if: ${{ inputs.rebuildDiskCache || false }}
        run: |
          rm -rf ~/.cache/bazel-disk
      - name: setup_coredump
        run: |
          set -x
          # Create directory for core dumps
          mkdir -p /tmp/corefile
          chmod 777 /tmp/corefile
          # Set core pattern to save core dumps with executable name, PID, signal and timestamp
          echo "/tmp/corefile/core.%e.%p.%s.%t" > /proc/sys/kernel/core_pattern
          # Verify configuration
          echo "core_pattern: $(cat /proc/sys/kernel/core_pattern)"
      - name: bazel_test
        run: |
          set -x
          set -e
          # Allow core files of any size so crashing tests leave a dump behind.
          ulimit -c unlimited
          bazelisk test ${{ matrix.test_target }} ${{ matrix.test_args }} --cache_test_results=no --test_output=errors
      - name: delete_old_disk_cache
        if: ${{ inputs.rebuildDiskCache || false }}
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          set -x
          # Install gh and jq (not pre-installed in the container image)
          ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/')
          # -f makes curl fail on HTTP errors instead of saving the error page as
          # the binary; -S still prints the error while -s hides the progress bar.
          curl -fsSLo /usr/local/bin/jq "https://github.com/jqlang/jq/releases/download/jq-1.8.1/jq-linux-${ARCH}" && chmod +x /usr/local/bin/jq
          GH_VER=2.89.0
          curl -fsSL "https://github.com/cli/cli/releases/download/v${GH_VER}/gh_${GH_VER}_linux_${ARCH}.tar.gz" | tar xz --strip-components=1 -C /usr/local
          # Delete old disk cache so setup-bazel post action can save the new one.
          # GitHub Actions cache is immutable - same key cannot be overwritten.
          # Match by our disk-cache name and runner arch to avoid depending on
          # setup-bazel's internal key format, while not deleting other arches' caches.
          RUNNER_ARCH_LOWER=$(echo "${{ runner.arch }}" | tr '[:upper:]' '[:lower:]')
          DISK_CACHE_NAME="disk-${{ runner.os }}-${{ github.workflow }}-${{ matrix.name }}"
          # Best effort: a failed deletion must not fail the job.
          gh cache list --repo ${{ github.repository }} --json id,key --limit 100 | \
            jq -r --arg arch "$RUNNER_ARCH_LOWER" --arg name "$DISK_CACHE_NAME" \
            '.[] | select((.key | contains($arch)) and (.key | contains($name))) | .id' | \
            xargs -I {} gh cache delete {} --repo ${{ github.repository }} || true
      - name: detect_crashes
        if: failure()
        run: |
          # Diagnostics only: keep going even if an individual command fails.
          set +e
          set -x
          echo "=== dmesg crash signals ==="
          dmesg -T 2>/dev/null | grep -iE 'segfault|killed process|oom-killer|traps:' || echo "No crash signals found in dmesg"
          echo ""
          echo "=== Core dump files ==="
          core_count=$(find /tmp/corefile/ -type f 2>/dev/null | wc -l)
          echo "Found $core_count core dump file(s)"
          ls -lah /tmp/corefile/ 2>/dev/null || true
          # Install gdb and file on-demand if core dumps exist
          if [ "$core_count" -gt 0 ]; then
            if ! command -v gdb >/dev/null 2>&1 || ! command -v file >/dev/null 2>&1; then
              echo "Installing gdb and file..."
              if command -v yum >/dev/null 2>&1; then
                yum install -y -q gdb file || true
              elif command -v apt-get >/dev/null 2>&1; then
                apt-get update -qq && apt-get install -y -qq gdb file || true
              fi
            fi
          fi
          for core in /tmp/corefile/core.*; do
            if [ -f "$core" ]; then
              echo ""
              echo "=== Core dump: $core ==="
              file_output=$(file "$core" || true)
              echo "$file_output"
              # Note: %e in core_pattern truncates exe name to 15 chars (TASK_COMM_LEN-1)
              # The file name is core.%e.%p.%s.%t and %e may itself contain dots,
              # so read the signal as the second-to-last dot-separated field
              # rather than counting fields from the left.
              core_name=$(basename "$core")
              core_sans_time=${core_name%.*}
              signal=${core_sans_time##*.}
              # Extract the real executable path from 'file' output (execfn field, not truncated)
              exe_path=$(echo "$file_output" | sed -n "s/.*execfn: '\([^']*\)'.*/\1/p")
              if [ -n "$exe_path" ] && [ -f "$exe_path" ]; then
                echo "Executable: $exe_path, Signal: $signal"
                echo "--- Backtrace ---"
                gdb -batch -ex "thread apply all bt" "$exe_path" "$core" 2>/dev/null || true
              else
                echo "Signal: $signal, executable not found at: $exe_path"
                echo "--- Backtrace (core only) ---"
                gdb -batch -ex "thread apply all bt" -c "$core" 2>/dev/null || true
              fi
            fi
          done
      - name: create_archive
        if: failure()
        run: |
          set -x
          # Archiving is best effort; missing files must not abort the step.
          set +e
          rm -f bazel-out.tar
          # `tar -r` appends, so each matching log file is added to the same archive.
          find bazel-out/*-opt/bin/ '(' -name 'kv_cache_manager.log' -o -name 'access.log' -o -name 'metrics.log' -o -name 'stdout' -o -name 'stderr' ')' -a -exec 'tar' '-r' '-f' 'bazel-out.tar' '{}' ';' 2>/dev/null
          # Collect core dump files and their corresponding executables for offline debugging
          for core in /tmp/corefile/core.*; do
            if [ -f "$core" ]; then
              tar -r -f bazel-out.tar "$core" 2>/dev/null
              exe_path=$(file "$core" 2>/dev/null | sed -n "s/.*execfn: '\([^']*\)'.*/\1/p")
              if [ -n "$exe_path" ] && [ -f "$exe_path" ]; then
                # -h stores the file a symlink points to instead of the link itself.
                tar -r -h -f bazel-out.tar "$exe_path" 2>/dev/null
              fi
            fi
          done
          tar -r -f bazel-out.tar bazel-out/*-opt/testlogs/ 2>/dev/null
          if [ -f bazel-out.tar ]; then
            gzip -f bazel-out.tar
          fi
      - uses: actions/upload-artifact@v6
        if: failure()
        with:
          name: ${{ matrix.name }}-bazel-out.tar.gz
          path: ./bazel-out.tar.gz
          overwrite: true