Skip to content

docs: restore benchmark results table, use cases, and full scenario r… #17

docs: restore benchmark results table, use cases, and full scenario r…

docs: restore benchmark results table, use cases, and full scenario r… #17

Workflow file for this run

# CI workflow with four jobs: a pull-request smoke test, the full DeepEval
# suite, an optional Confident AI upload, and a pull-request quality gate.
name: DeepEval LLM Quality Tests

Check failure on line 1 in .github/workflows/deepeval.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/deepeval.yml

Invalid workflow file

(Line: 121, Col: 13): Unrecognized named-value: 'secrets'. Located at position 1 within expression: secrets.CONFIDENT_API_KEY != '', (Line: 126, Col: 13): Unrecognized named-value: 'secrets'. Located at position 1 within expression: secrets.CONFIDENT_API_KEY != ''
# Triggers. Pull requests, and pushes to main, start the workflow only when
# files under the listed service, workflow, or DeepEval test paths change.
# A manual dispatch is also available and exposes one boolean input.
on:
  pull_request:
    paths:
      - 'src/services/**'
      - 'src/workflows/**'
      - 'tests/deepeval/**'
  push:
    branches: [main]
    paths:
      - 'src/services/**'
      - 'src/workflows/**'
      - 'tests/deepeval/**'
  workflow_dispatch:
    inputs:
      run_full_suite:
        type: boolean
        description: Run full evaluation test suite
        required: false
        default: 'false'
# Workflow-level environment, visible to every job below.
env:
  DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
  # Quoted so the version stays a string rather than being parsed as a float.
  PYTHON_VERSION: '3.11'
jobs:
  # Fast feedback for pull requests: runs two selected DeepEval tests only.
  smoke-test:
    name: DeepEval Smoke Test
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: pip
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      - name: Run DeepEval smoke tests
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
        run: |
          # Run a subset of tests for quick feedback
          pytest tests/deepeval/test_chat_quality.py::test_chat_response_relevancy \
            tests/deepeval/test_router_accuracy.py::test_router_confidence_calibration \
            -v --tb=short
# Full evaluation suite: runs when the ref is main, or when a manual dispatch
# sets the run_full_suite input to true.
  full-evaluation:
    name: Full DeepEval Suite
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main' || github.event.inputs.run_full_suite == 'true'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      - name: Run full DeepEval test suite
        run: |
          # Run all DeepEval tests with parallelization
          deepeval test run tests/deepeval/ -n 4 --verbose
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}

      # always() keeps the upload running even when the test step fails, so
      # results of a failed evaluation are still available for inspection.
      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: deepeval-results
          path: |
            .deepeval/
            test-results/
          # upload-artifact v4 skips hidden files and directories by default,
          # which would drop .deepeval/ from the artifact.
          # NOTE(review): confirm .deepeval/ never holds credentials in this
          # job before relying on this (no login step runs here).
          include-hidden-files: true
          retention-days: 30
# Confident AI dashboard upload (optional): runs on main after the full
# evaluation job succeeds, and only does work when the API key is configured.
  upload-to-confident:
    name: Upload to Confident AI
    runs-on: ubuntu-latest
    needs: full-evaluation
    if: github.ref == 'refs/heads/main' && success()
    # The secrets context is not available inside step-level `if` expressions
    # (it fails validation with "Unrecognized named-value: 'secrets'"), so the
    # key is mapped to a job-level environment variable and the steps below
    # test env.CONFIDENT_API_KEY instead.
    env:
      CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      # Skipped when the secret is unset (an unset secret expands to '').
      - name: Login to Confident AI
        if: env.CONFIDENT_API_KEY != ''
        continue-on-error: true
        # The key is read from the environment instead of being interpolated
        # into the script text before the shell runs it.
        run: deepeval login --confident-api-key "$CONFIDENT_API_KEY"

      # CONFIDENT_API_KEY reaches this step through the job-level env above.
      - name: Push results to Confident AI dashboard
        if: env.CONFIDENT_API_KEY != ''
        continue-on-error: true
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
        run: deepeval test run tests/deepeval/ --push
# Pull-request quality gate; depends on the smoke-test job via `needs`.
  quality-gate:
    name: Quality Gate
    if: github.event_name == 'pull_request'
    needs: smoke-test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: pip
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      # Informational step: the inline script prints the configured
      # thresholds and always exits with status 0.
      - name: Check quality thresholds
        run: |
          python -c "
          import sys
          # Define quality thresholds
          THRESHOLDS = {
              'relevancy_score': 0.7,
              'hallucination_risk': 0.3,  # Must be BELOW this
              'reasoning_depth': 0.6,
              'uix_compliance': 0.6
          }
          print('=== CYNEPIC Quality Gate ===')
          print('Checking LLM output quality thresholds:')
          for metric, threshold in THRESHOLDS.items():
              operator = '<=' if 'risk' in metric else '>='
              print(f' {metric}: {operator} {threshold}')
          print()
          print('Quality gate check: PASSED')
          print('Note: Actual threshold validation happens in test assertions')
          sys.exit(0)
          "