Skip to content

Content Sync

Content Sync #427

Workflow file for this run

# Content Sync workflow.
# Runs on two cron schedules and on manual dispatch; the setup job below
# decides which source groups are synced for a given trigger.
name: Content Sync

on:
  workflow_dispatch:
    inputs:
      source:
        type: string
        required: false
        description: 'Source group (empty = all). Options: landesverbaende, gruenblog, gruene-at, kommunalwiki, boell-stiftung, bundestag'
      dry_run:
        type: boolean
        default: false
        description: 'Preview without storing'
      force:
        type: boolean
        default: false
        description: 'Force re-index even if unchanged'
  schedule:
    # Cron expressions are evaluated in UTC.
    # Hourly, 04:00-20:00 UTC (06-22 CEST) → landesverbaende only
    - cron: '0 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20 * * *'
    # Daily, 02:00 UTC (03:00 CET) → all sources
    - cron: '0 2 * * *'

# Needed by the jobs that open a pull request for the generated stats page.
permissions:
  pull-requests: write
  contents: write

# One concurrency group per cron expression, plus a shared 'manual' group for
# dispatches; a newer run queues behind a running one instead of cancelling it.
concurrency:
  cancel-in-progress: false
  group: content-sync-${{ github.event.schedule || 'manual' }}
jobs:
  # ─── Determine execution strategy ──────────────────────────────
  # Publishes two outputs for the downstream jobs:
  #   use_matrix  'true'  → the matrix jobs (sync + aggregate) run,
  #               'false' → the single-source job (sync-single) runs.
  #   matrix      JSON object of the form {"source": [...]} that feeds the
  #               sync job's strategy.matrix.
  setup:
    name: Setup
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      use_matrix: ${{ steps.set-matrix.outputs.use_matrix }}
    steps:
      - id: set-matrix
        env:
          # Handed over via the environment so the values are never spliced
          # into the script text itself.
          SCHEDULE: ${{ github.event.schedule }}
          SOURCE: ${{ inputs.source }}
        run: |
          if [ -n "$SOURCE" ]; then
            # Manual single-source run
            echo 'use_matrix=false' >> "$GITHUB_OUTPUT"
            # SOURCE is free text: let jq do the JSON escaping so that quotes or
            # newlines in the input can neither produce invalid JSON nor add
            # extra lines to the output file.
            echo "matrix=$(jq -cn --arg source "$SOURCE" '{source: [$source]}')" >> "$GITHUB_OUTPUT"
          elif [ "$SCHEDULE" = "0 2 * * *" ] || [ -z "$SCHEDULE" ]; then
            # Daily 02:00 UTC schedule, or manual dispatch without a source → all sources
            echo 'use_matrix=true' >> "$GITHUB_OUTPUT"
            echo 'matrix={"source":["landesverbaende","gruenblog","gruene-at","kommunalwiki","boell-stiftung","bundestag"]}' >> "$GITHUB_OUTPUT"
          else
            # Any other schedule (the hourly one) → landesverbaende only
            echo 'use_matrix=false' >> "$GITHUB_OUTPUT"
            echo 'matrix={"source":["landesverbaende"]}' >> "$GITHUB_OUTPUT"
          fi
# ─── Matrix: one job per source group ──────────────────────────
  # Runs only when the setup job selected the matrix strategy. Each leg syncs
  # one source group with --no-email and uploads its summary as an artifact
  # named sync-summary-<source>.
  sync:
    name: Sync ${{ matrix.source }}
    needs: setup
    if: needs.setup.outputs.use_matrix == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 60
    strategy:
      matrix: ${{ fromJSON(needs.setup.outputs.matrix) }}
      # One failing source must not cancel the other legs.
      fail-fast: false
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: 'pnpm'
      - run: pnpm install --frozen-lockfile --filter @gruenerator/api...
      - name: Run content sync
        id: sync
        # Keep going so the summary is uploaded; the last step of this job
        # re-raises the failure.
        continue-on-error: true
        env:
          # Workflow values reach the script through the environment instead
          # of being interpolated into the shell text.
          WF_SOURCE: ${{ matrix.source }}
          WF_DRY_RUN: ${{ inputs.dry_run }}
          WF_FORCE: ${{ inputs.force }}
          QDRANT_URL: ${{ secrets.QDRANT_URL }}
          QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
          QDRANT_BASIC_AUTH_USERNAME: ${{ secrets.QDRANT_BASIC_AUTH_USERNAME }}
          QDRANT_BASIC_AUTH_PASSWORD: ${{ secrets.QDRANT_BASIC_AUTH_PASSWORD }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
          APIFY_TOKEN: ${{ secrets.APIFY_TOKEN }}
          BREVO_SMTP_HOST: ${{ secrets.BREVO_SMTP_HOST }}
          BREVO_SMTP_PORT: ${{ secrets.BREVO_SMTP_PORT }}
          BREVO_SMTP_USER: ${{ secrets.BREVO_SMTP_USER }}
          BREVO_SMTP_PASS: ${{ secrets.BREVO_SMTP_PASS }}
          EMAIL_FROM: ${{ vars.EMAIL_FROM || 'Grünerator <info@gruenerator.eu>' }}
          # Per-source file name: the aggregate job downloads every
          # sync-summary-* artifact into ONE directory (merge-multiple: true),
          # so identically named files would overwrite each other.
          # NOTE(review): assumes aggregate-sync-summaries.ts reads every JSON
          # file found in --dir rather than one fixed file name — confirm.
          SYNC_SUMMARY_PATH: ${{ github.workspace }}/sync-summary-${{ matrix.source }}.json
        run: |
          # Collect the arguments in an array so each one is passed as exactly
          # one word, whatever characters it contains.
          ARGS=(--source "$WF_SOURCE" --no-email)
          if [ "$WF_DRY_RUN" = "true" ]; then ARGS+=(--dry-run); fi
          if [ "$WF_FORCE" = "true" ]; then ARGS+=(--force); fi
          echo "Running: npx tsx apps/api/update-all-content.ts ${ARGS[*]}"
          npx tsx apps/api/update-all-content.ts "${ARGS[@]}"
      - name: Upload summary
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: sync-summary-${{ matrix.source }}
          path: sync-summary-${{ matrix.source }}.json
          if-no-files-found: warn
          retention-days: 7
      - name: Fail if sync had errors
        if: steps.sync.outcome == 'failure'
        run: exit 1
# ─── Aggregate matrix results ──────────────────────────────────
  # Runs after the matrix legs, also when some of them failed (always()).
  # Downloads the per-source summaries, runs the aggregation script, renders
  # the aggregated sync-summary.json as the job summary, then regenerates the
  # content stats page and proposes it as a pull request.
  aggregate:
    name: Aggregate & Report
    needs: [setup, sync]
    if: always() && needs.setup.outputs.use_matrix == 'true'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: 'pnpm'
      - run: pnpm install --frozen-lockfile --filter @gruenerator/api...
      - name: Download all summaries
        uses: actions/download-artifact@v4
        with:
          path: summaries/
          pattern: sync-summary-*
          # All matching artifacts are extracted into the same directory.
          merge-multiple: true
      - name: Aggregate summaries
        run: npx tsx apps/api/aggregate-sync-summaries.ts --dir summaries/
        env:
          BREVO_SMTP_HOST: ${{ secrets.BREVO_SMTP_HOST }}
          BREVO_SMTP_PORT: ${{ secrets.BREVO_SMTP_PORT }}
          BREVO_SMTP_USER: ${{ secrets.BREVO_SMTP_USER }}
          BREVO_SMTP_PASS: ${{ secrets.BREVO_SMTP_PASS }}
          EMAIL_FROM: ${{ vars.EMAIL_FROM || 'Grünerator <info@gruenerator.eu>' }}
          CONTENT_SYNC_EMAIL: ${{ vars.CONTENT_SYNC_EMAIL }}
          SYNC_SUMMARY_PATH: ${{ github.workspace }}/sync-summary.json
      - name: Generate job summary
        if: always()
        run: |
          SUMMARY_FILE="${{ github.workspace }}/sync-summary.json"
          RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          if [ ! -f "$SUMMARY_FILE" ]; then
            {
              echo "## ❌ Content Sync — No Summary"
              echo ""
              echo "The sync script did not produce a summary file. Check the [workflow logs]($RUN_URL)."
            } >> "$GITHUB_STEP_SUMMARY"
            exit 0
          fi
          # Parse JSON summary. Every counter falls back to 0 so the numeric
          # tests below never receive the string "null" for a missing field.
          TIMESTAMP=$(jq -r '.timestamp' "$SUMMARY_FILE")
          DRY_RUN=$(jq -r '.dryRun' "$SUMMARY_FILE")
          FORCE=$(jq -r '.force' "$SUMMARY_FILE")
          TOTAL_SOURCES=$(jq -r '.totals.sources // 0' "$SUMMARY_FILE")
          SUCCEEDED=$(jq -r '.totals.succeeded // 0' "$SUMMARY_FILE")
          FAILED=$(jq -r '.totals.failed // 0' "$SUMMARY_FILE")
          STORED=$(jq -r '.totals.stored // 0' "$SUMMARY_FILE")
          UPDATED=$(jq -r '.totals.updated // 0' "$SUMMARY_FILE")
          SKIPPED=$(jq -r '.totals.skipped // 0' "$SUMMARY_FILE")
          FETCH_ERRORS=$(jq -r '.totals.fetchErrors // 0' "$SUMMARY_FILE")
          ERRORS=$(jq -r '.totals.errors // 0' "$SUMMARY_FILE")
          DURATION=$(jq -r '.totalDuration' "$SUMMARY_FILE")
          # Determine status
          if [ "$FAILED" -gt 0 ] || [ "$ERRORS" -gt 0 ]; then
            STATUS_ICON="⚠️"
          else
            STATUS_ICON="✅"
          fi
          if [ "$DRY_RUN" = "true" ]; then
            TITLE="$STATUS_ICON Content Sync — Dry Run"
          else
            TITLE="$STATUS_ICON Content Sync Report"
          fi
          # Build summary
          {
            echo "## $TITLE"
            echo ""
            # -u pins the conversion to UTC so it matches the printed label;
            # an unparsable timestamp is shown verbatim.
            echo "**Date:** $(date -u -d "$TIMESTAMP" '+%d.%m.%Y %H:%M UTC' 2>/dev/null || echo "$TIMESTAMP")"
            echo "**Duration:** ${DURATION}s"
            echo "**Run:** [${{ github.run_id }}]($RUN_URL)"
            if [ "$FORCE" = "true" ]; then echo "**Mode:** Force re-index"; fi
            echo ""
            echo "### Totals"
            echo ""
            echo "| Metric | Count |"
            echo "|--------|------:|"
            echo "| Sources | $TOTAL_SOURCES ($SUCCEEDED ok, $FAILED failed) |"
            echo "| New documents | $STORED |"
            echo "| Updated | $UPDATED |"
            echo "| Skipped (unchanged) | $SKIPPED |"
            if [ "$FETCH_ERRORS" -gt 0 ]; then echo "| Unreachable pages | $FETCH_ERRORS |"; fi
            if [ "$ERRORS" -gt 0 ]; then echo "| **Errors** | **$ERRORS** |"; fi
            echo ""
            echo "### Per Source"
            echo ""
            echo "| Source | Status | New | Updated | Skipped | Unreachable | Errors | Duration |"
            echo "|--------|--------|----:|--------:|--------:|------------:|-------:|---------:|"
            jq -r '.sources[] | "| \(.name) | \(if .status == "success" then "✅" else "❌" end) | \(.stored) | \(.updated) | \(.skipped) | \(.fetchErrors // 0) | \(.errors) | \(.duration)s |"' "$SUMMARY_FILE"
            echo ""
            # Show failed sources with error messages
            FAILED_SOURCES=$(jq -r '.sources[] | select(.status == "failed") | "- **\(.name):** \(.error)"' "$SUMMARY_FILE")
            if [ -n "$FAILED_SOURCES" ]; then
              echo "### ❌ Failed Sources"
              echo ""
              echo "$FAILED_SOURCES"
              echo ""
            fi
            # Show sources with new content
            NEW_CONTENT=$(jq -r '.sources[] | select(.stored > 0) | "- **\(.name):** +\(.stored) new documents"' "$SUMMARY_FILE")
            if [ -n "$NEW_CONTENT" ]; then
              echo "### 📥 New Content"
              echo ""
              echo "$NEW_CONTENT"
              echo ""
            fi
          } >> "$GITHUB_STEP_SUMMARY"
      - name: Update content stats page
        if: always()
        run: npx tsx apps/api/generate-content-stats.ts
        env:
          QDRANT_URL: ${{ secrets.QDRANT_URL }}
          QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
          QDRANT_BASIC_AUTH_USERNAME: ${{ secrets.QDRANT_BASIC_AUTH_USERNAME }}
          QDRANT_BASIC_AUTH_PASSWORD: ${{ secrets.QDRANT_BASIC_AUTH_PASSWORD }}
      - name: Create PR for stats page update
        if: always()
        uses: peter-evans/create-pull-request@v7
        with:
          commit-message: 'docs: update content stats [skip ci]'
          title: 'docs: update content stats'
          body: 'Automated content stats update from Content Sync workflow.'
          branch: automated/content-stats-update
          # Only the generated stats page is staged for the pull request.
          add-paths: documentation/docs/ueber-den-gruenerator/inhaltsdatenbank.md
          delete-branch: true
          labels: automated
# ─── Single-source run (no matrix overhead) ────────────────────
  # Runs when the setup job selected the non-matrix strategy: a manual dispatch
  # with an explicit source, or a schedule that setup routes to a single
  # source. When no source input exists (scheduled runs) the job falls back to
  # 'landesverbaende'; the same fallback is applied to the job name, the sync
  # arguments and the summary title so they always agree.
  sync-single:
    name: Sync ${{ inputs.source || 'landesverbaende' }}
    needs: setup
    if: needs.setup.outputs.use_matrix == 'false'
    runs-on: ubuntu-latest
    timeout-minutes: 120
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: 'pnpm'
      - run: pnpm install --frozen-lockfile --filter @gruenerator/api...
      - name: Run content sync
        id: sync
        # Keep going so the summary and stats steps still run; the last step
        # of this job re-raises the failure.
        continue-on-error: true
        env:
          # The source is free-text user input: pass it through the
          # environment instead of interpolating it into the shell text.
          WF_SOURCE: ${{ inputs.source || 'landesverbaende' }}
          WF_DRY_RUN: ${{ inputs.dry_run }}
          WF_FORCE: ${{ inputs.force }}
          QDRANT_URL: ${{ secrets.QDRANT_URL }}
          QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
          QDRANT_BASIC_AUTH_USERNAME: ${{ secrets.QDRANT_BASIC_AUTH_USERNAME }}
          QDRANT_BASIC_AUTH_PASSWORD: ${{ secrets.QDRANT_BASIC_AUTH_PASSWORD }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
          APIFY_TOKEN: ${{ secrets.APIFY_TOKEN }}
          BREVO_SMTP_HOST: ${{ secrets.BREVO_SMTP_HOST }}
          BREVO_SMTP_PORT: ${{ secrets.BREVO_SMTP_PORT }}
          BREVO_SMTP_USER: ${{ secrets.BREVO_SMTP_USER }}
          BREVO_SMTP_PASS: ${{ secrets.BREVO_SMTP_PASS }}
          EMAIL_FROM: ${{ vars.EMAIL_FROM || 'Grünerator <info@gruenerator.eu>' }}
          CONTENT_SYNC_EMAIL: ${{ vars.CONTENT_SYNC_EMAIL }}
          SYNC_SUMMARY_PATH: ${{ github.workspace }}/sync-summary.json
        run: |
          # Collect the arguments in an array so each one is passed as exactly
          # one word, whatever characters the source name contains.
          ARGS=(--source "$WF_SOURCE")
          if [ "$WF_DRY_RUN" = "true" ]; then ARGS+=(--dry-run); fi
          if [ "$WF_FORCE" = "true" ]; then ARGS+=(--force); fi
          echo "Running: npx tsx apps/api/update-all-content.ts ${ARGS[*]}"
          npx tsx apps/api/update-all-content.ts "${ARGS[@]}"
      - name: Generate job summary
        if: always()
        env:
          WF_SOURCE: ${{ inputs.source || 'landesverbaende' }}
        run: |
          SUMMARY_FILE="${{ github.workspace }}/sync-summary.json"
          RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          if [ ! -f "$SUMMARY_FILE" ]; then
            {
              echo "## ❌ Content Sync — No Summary"
              echo ""
              echo "The sync script did not produce a summary file. Check the [workflow logs]($RUN_URL)."
            } >> "$GITHUB_STEP_SUMMARY"
            exit 0
          fi
          TIMESTAMP=$(jq -r '.timestamp' "$SUMMARY_FILE")
          DRY_RUN=$(jq -r '.dryRun' "$SUMMARY_FILE")
          # Counters fall back to 0 so the numeric test below never receives
          # the string "null" for a missing field.
          STORED=$(jq -r '.totals.stored // 0' "$SUMMARY_FILE")
          ERRORS=$(jq -r '.totals.errors // 0' "$SUMMARY_FILE")
          DURATION=$(jq -r '.totalDuration' "$SUMMARY_FILE")
          if [ "$ERRORS" -gt 0 ]; then STATUS_ICON="⚠️"; else STATUS_ICON="✅"; fi
          if [ "$DRY_RUN" = "true" ]; then TITLE="$STATUS_ICON $WF_SOURCE — Dry Run"; else TITLE="$STATUS_ICON $WF_SOURCE"; fi
          {
            echo "## $TITLE"
            echo ""
            # -u pins the conversion to UTC so it matches the printed label;
            # an unparsable timestamp is shown verbatim.
            echo "**Date:** $(date -u -d "$TIMESTAMP" '+%d.%m.%Y %H:%M UTC' 2>/dev/null || echo "$TIMESTAMP")"
            echo "**Duration:** ${DURATION}s | **New:** +${STORED} | **Errors:** ${ERRORS}"
          } >> "$GITHUB_STEP_SUMMARY"
      - name: Update content stats page
        if: always()
        run: npx tsx apps/api/generate-content-stats.ts
        env:
          QDRANT_URL: ${{ secrets.QDRANT_URL }}
          QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
          QDRANT_BASIC_AUTH_USERNAME: ${{ secrets.QDRANT_BASIC_AUTH_USERNAME }}
          QDRANT_BASIC_AUTH_PASSWORD: ${{ secrets.QDRANT_BASIC_AUTH_PASSWORD }}
      - name: Create PR for stats page update
        if: always()
        uses: peter-evans/create-pull-request@v7
        with:
          commit-message: 'docs: update content stats [skip ci]'
          title: 'docs: update content stats'
          body: 'Automated content stats update from Content Sync workflow.'
          branch: automated/content-stats-update
          # Only the generated stats page is staged for the pull request.
          add-paths: documentation/docs/ueber-den-gruenerator/inhaltsdatenbank.md
          delete-branch: true
          labels: automated
      - name: Fail if sync had errors
        if: steps.sync.outcome == 'failure'
        run: exit 1