docs: restore benchmark results table, use cases, and full scenario r… #17
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: DeepEval LLM Quality Tests

# Triggers:
#   - pull requests and pushes to main that touch service code, workflow code,
#     or the DeepEval tests themselves
#   - manual dispatch, with an opt-in flag for the full evaluation suite
on:
  pull_request:
    paths:
      - 'src/services/**'
      - 'src/workflows/**'
      - 'tests/deepeval/**'
  push:
    branches:
      - main
    paths:
      - 'src/services/**'
      - 'src/workflows/**'
      - 'tests/deepeval/**'
  workflow_dispatch:
    inputs:
      run_full_suite:
        description: 'Run full evaluation test suite'
        required: false
        default: 'false'
        type: boolean

# Least privilege for GITHUB_TOKEN: the steps in this workflow check out the
# repository and run tests; none of them write to the repository or call the
# GitHub API, so read access to contents is sufficient.
permissions:
  contents: read

env:
  # Quoted so the version stays a string (an unquoted 3.10 would parse as 3.1).
  PYTHON_VERSION: '3.11'
  # Workflow-wide default; the test steps below also set this explicitly.
  DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}

jobs:
# ---- smoke-test: fast feedback on pull requests ----
  smoke-test:
    if: github.event_name == 'pull_request'
    name: DeepEval Smoke Test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: pip
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      # Only two targeted tests run here (chat relevancy and router
      # confidence calibration) to keep PR turnaround short.
      - name: Run DeepEval smoke tests
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
        run: |
          pytest -v --tb=short \
            tests/deepeval/test_chat_quality.py::test_chat_response_relevancy \
            tests/deepeval/test_router_accuracy.py::test_router_confidence_calibration
# ---- full-evaluation: complete DeepEval suite ----
  # Runs for any event whose ref is main, or when a manual dispatch sets
  # run_full_suite to true.
  full-evaluation:
    name: Full DeepEval Suite
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main' || github.event.inputs.run_full_suite == 'true'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      # Run every DeepEval test, spread over 4 worker processes.
      - name: Run full DeepEval test suite
        run: |
          deepeval test run tests/deepeval/ -n 4 --verbose
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}

      # always() keeps the results available even when the test step fails.
      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: deepeval-results
          path: |
            .deepeval/
            test-results/
          # .deepeval/ is a hidden directory; upload-artifact skips hidden
          # files by default, which would silently drop it from the artifact.
          # NOTE(review): confirm .deepeval/ never holds credentials before
          # relying on this artifact being shareable.
          include-hidden-files: true
          retention-days: 30
# ---- upload-to-confident: optional Confident AI dashboard upload ----
  # Runs on main after a successful full evaluation. The login and push steps
  # are skipped when no CONFIDENT_API_KEY secret is configured, and both are
  # best-effort (continue-on-error) so a dashboard outage cannot fail the run.
  upload-to-confident:
    name: Upload to Confident AI
    runs-on: ubuntu-latest
    needs: full-evaluation
    if: github.ref == 'refs/heads/main' && success()
    env:
      # The secrets context cannot be referenced in `if:` conditionals, so the
      # secret is exposed as a job-level env var and the steps test that.
      CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      # The key is read from the environment by the shell, not expanded into
      # the script text by the workflow template engine.
      - name: Login to Confident AI
        if: env.CONFIDENT_API_KEY != ''
        run: deepeval login --confident-api-key "$CONFIDENT_API_KEY"
        continue-on-error: true

      - name: Push results to Confident AI dashboard
        if: env.CONFIDENT_API_KEY != ''
        run: deepeval test run tests/deepeval/ --push
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
        continue-on-error: true
# ---- quality-gate: PR threshold summary ----
  # Runs after smoke-test on pull requests. The script below only prints the
  # configured thresholds and always exits 0; as its own output states, the
  # real threshold validation happens in the test assertions.
  quality-gate:
    if: github.event_name == 'pull_request'
    needs: smoke-test
    name: Quality Gate
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          cache: pip
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e ".[dev,evaluation]"

      - name: Check quality thresholds
        run: |
          python - <<'PY'
          import sys

          # Metric name -> limit. Metrics with "risk" in the name are upper
          # bounds; all others are lower bounds.
          limits = {
              'relevancy_score': 0.7,
              'hallucination_risk': 0.3,
              'reasoning_depth': 0.6,
              'uix_compliance': 0.6,
          }

          print('=== CYNEPIC Quality Gate ===')
          print('Checking LLM output quality thresholds:')
          for name, limit in limits.items():
              if 'risk' in name:
                  comparison = '<='
              else:
                  comparison = '>='
              print(f' {name}: {comparison} {limit}')
          print()
          print('Quality gate check: PASSED')
          print('Note: Actual threshold validation happens in test assertions')
          sys.exit(0)
          PY