Skip to content

tune otel loki catch-up ingestion #1814

tune otel loki catch-up ingestion

tune otel loki catch-up ingestion #1814

Workflow file for this run

# Workflow identity, triggers, and default token permissions.
name: Cluster CI

# Runs for pull requests, and for pushes to main, but only when a change
# touches one of the listed paths. Both triggers use the same path list;
# keep them in sync when editing.
on:
  pull_request:
    paths:
      - 'infrastructure/**'
      - 'monitoring/**'
      - 'my-apps/**'
      - 'scripts/**'
      # Also matched by scripts/** above; listed explicitly.
      - 'scripts/validate-argocd-apps.sh'
      - '.github/workflows/cluster-ci.yml'
      - '.github/renovate.json5'
  push:
    branches:
      - 'main'
    paths:
      - 'infrastructure/**'
      - 'monitoring/**'
      - 'my-apps/**'
      - 'scripts/**'
      # Also matched by scripts/** above; listed explicitly.
      - 'scripts/validate-argocd-apps.sh'
      - '.github/workflows/cluster-ci.yml'
      - '.github/renovate.json5'

# Workflow-level GITHUB_TOKEN scope: read-only access to repository contents.
permissions:
  contents: read
jobs:
argocd-structure:
  # Checks out the repository and runs scripts/validate-argocd-apps.sh.
  # The actual checks live in that script, not in this workflow.
  name: 'ArgoCD Structure Validation'
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Validate ArgoCD app topology
      run: bash ./scripts/validate-argocd-apps.sh
truenas-csi-contract:
  # Installs a pinned Kustomize and runs scripts/validate-truenas-csi.sh.
  # The contract being enforced is defined by that script.
  name: 'TrueNAS CSI Contract'
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup Kustomize
      uses: imranismail/setup-kustomize@v2
      with:
        # Quoted so the version is always read as a string.
        kustomize-version: '5.4.2'

    - name: Validate official TrueNAS CSI deployment
      run: bash ./scripts/validate-truenas-csi.sh
render-and-schema:
  # Renders every kustomization under infrastructure/, monitoring/ and
  # my-apps/ into one multi-document stream (/tmp/all-manifests.yaml), then
  # checks that stream with the restore-contract script and with kubeconform.
  name: Kustomize Render and Schema Validation
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Setup Kustomize
      uses: imranismail/setup-kustomize@v2
      with:
        kustomize-version: '5.4.2'
    - name: Setup Helm
      uses: azure/setup-helm@v4
      with:
        version: v3.16.2
    - name: Install kubeconform
      run: |
        set -euo pipefail
        KUBECONFORM_VERSION="v0.6.7"
        # --fail: an HTTP error (e.g. 404 for a mistyped version) aborts
        # here with curl's own message instead of saving the error page
        # and failing later in tar with a misleading "not in gzip format".
        # --retry: ride out transient download failures.
        curl --fail --silent --show-error --location --retry 3 \
          --output /tmp/kubeconform.tar.gz \
          "https://github.com/yannh/kubeconform/releases/download/${KUBECONFORM_VERSION}/kubeconform-linux-amd64.tar.gz"
        tar -xzf /tmp/kubeconform.tar.gz -C /tmp
        sudo mv /tmp/kubeconform /usr/local/bin/kubeconform
        kubeconform -v
    - name: Render all kustomizations
      run: |
        set -euo pipefail
        # Every directory that contains a kustomization.yaml is rendered.
        mapfile -t dirs < <(find infrastructure monitoring my-apps -type f -name kustomization.yaml -exec dirname {} \; | sort -u)
        if [ "${#dirs[@]}" -eq 0 ]; then
          echo "No kustomization directories found."
          exit 1
        fi
        # Start from an empty file, then append each render followed by a
        # document separator so the result is one multi-document stream.
        : > /tmp/all-manifests.yaml
        for dir in "${dirs[@]}"; do
          echo "Rendering ${dir}"
          kustomize build "${dir}" --enable-helm >> /tmp/all-manifests.yaml
          echo "---" >> /tmp/all-manifests.yaml
        done
    - name: Validate pvc-plumber restore contract (rendered)
      # A managed PVC without dataSourceRef -> <pvc>-dst backs up fine
      # but recreates EMPTY during DR. Runs against the rendered stream
      # (not raw YAML) so Helm-rendered PVCs are covered — e.g.
      # gitea/gitea-shared-storage, which a static grep cannot see.
      run: |
        set -euo pipefail
        # Install PyYAML only when the runner's python3 cannot import it.
        python3 -c "import yaml" 2>/dev/null || pip3 install --quiet pyyaml
        bash ./scripts/validate-restore-contract.sh /tmp/all-manifests.yaml
    - name: Validate Kubernetes schemas
      run: |
        set -euo pipefail
        # Filter known kubeconform false positives:
        #   - Gitea Helm-rendered Service gitea-http: targetPort triggers a
        #     oneOf ambiguity in the Kubernetes Service schema.
        #   - VolSync ReplicationSource and ReplicationDestination:
        #     Datree's CRD catalog schema is stale for our installed
        #     VolSync and rejects valid spec.kopia configs on both kinds.
        #     Both must be skipped — the previous filter skipped only
        #     ReplicationSource, which let the 28 inline RD resources
        #     in my-apps/** fail kubeconform with "additionalProperties
        #     'kopia' not allowed" on every Cluster CI run.
        # Split multi-doc YAML into per-document files, skip those documents.
        csplit -z -f /tmp/doc- /tmp/all-manifests.yaml '/^---$/' '{*}' > /dev/null
        : > /tmp/filtered-manifests.yaml
        for f in /tmp/doc-*; do
          # All patterns are anchored to whole lines so that only the
          # document's own top-level kind/apiVersion and metadata.name can
          # trigger a skip. Unanchored substring matches also hit
          # ServiceAccount/ServiceMonitor, names such as gitea-http-foo,
          # nested references (e.g. backendRefs with kind: Service and
          # name: gitea-http), and manifests embedded in ConfigMaps, and
          # silently dropped those documents from schema validation.
          # Assumes kustomize's output layout: top-level keys at column 0
          # and metadata fields indented by two spaces.
          if grep -q '^kind: Service$' "$f" && grep -q '^  name: gitea-http$' "$f"; then
            continue
          fi
          if grep -q '^apiVersion: volsync\.backube/v1alpha1$' "$f" \
            && grep -Eq '^kind: Replication(Source|Destination)$' "$f"; then
            continue
          fi
          cat "$f" >> /tmp/filtered-manifests.yaml
        done
        # Kinds without any schema are skipped rather than failed
        # (-ignore-missing-schemas); CRD schemas come from the Datree
        # CRDs-catalog in addition to the default schema location.
        kubeconform \
          -summary \
          -ignore-missing-schemas \
          -schema-location default \
          -schema-location 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' \
          /tmp/filtered-manifests.yaml
kyverno-policy-safety:
  # Checks out the repository and runs scripts/validate-kyverno-policies.sh.
  # Which settings count as dangerous is decided by that script.
  name: 'Kyverno Policy Safety Check'
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Check for dangerous Kyverno generate policy settings
      run: bash ./scripts/validate-kyverno-policies.sh
backup-exempt-contract:
  name: 'Backup-Exempt Annotation Contract'
  # Guards against the bug class found 2026-05-19 (and again 2026-06-09 in
  # monitoring/ Helm values): a PVC labeled backup-exempt:"true" that uses
  # the bare `backup-exempt-reason` key rather than the fully-qualified
  # `storage.vanillax.dev/...` key. pvc-plumber v4 (permissive, no
  # admission webhook) classifies that as ExemptMissingReason and parks
  # the PVC in /audit as needs-human-review, where it masks real
  # findings; any future strict mode would deny it at admission.
  # Fail at PR time, not during DR.
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    # The key check itself is implemented in the script below.
    - name: Validate backup-exempt annotation keys
      run: bash ./scripts/validate-backup-exempt-keys.sh
otel-collector-validate:
  name: 'OpenTelemetry Collector Config Validation'
  # Guards against the bug class behind the 2026-04-20 9-hour
  # root-sync jam: a pipeline referencing a receiver that was removed.
  # Runs `otelcol validate` on each OpenTelemetryCollector's rendered
  # config so pipeline-receiver-exporter mismatches are rejected at
  # PR time instead of crashlooping in-cluster.
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    # Same pinned Kustomize and Helm versions as the render-and-schema job.
    - name: Setup Kustomize
      uses: imranismail/setup-kustomize@v2
      with:
        kustomize-version: '5.4.2'

    - name: Setup Helm
      uses: azure/setup-helm@v4
      with:
        version: 'v3.16.2'

    - name: Validate all OpenTelemetryCollector configs
      run: bash ./scripts/validate-otel-configs.sh
renovate-config-validate:
  name: 'Renovate Config Validation'
  # Guards against the bug class that opened issue #1284 on 2026-05-10:
  # `packageRules` cannot combine `matchUpdateTypes` and `versioning`,
  # but the validator only flags it at runtime — Renovate stops opening
  # PRs cluster-wide until fixed. Run validator on every PR that touches
  # .github/renovate.json5 so the bad rule never reaches main.
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup Node
      uses: actions/setup-node@v4
      with:
        # Renovate >=41 requires Node >=22 and dropped Node 20. On Node 20,
        # `npx renovate` silently resolves to an old renovate major that
        # still uses `fileMatch`, so it rejects this repo's modern
        # `managerFilePatterns` config and fails EVERY renovate PR.
        node-version: '22'

    - name: Validate Renovate config
      run: |
        set -euo pipefail
        # The validator looks at the filename to decide global vs repo
        # config; .github/renovate.json5 is treated as global, which
        # surfaces the same packageRules / managers errors we care about.
        # NOTE(review): confirm the global-vs-repo classification against
        # the renovate-config-validator documentation.
        npx --yes --package=renovate renovate-config-validator --strict .github/renovate.json5
shellcheck:
  name: 'Shell Script Lint (Informational)'
  runs-on: ubuntu-latest
  # Informational only: a failure of this job does not fail the workflow run.
  continue-on-error: true
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Install shellcheck
      run: |
        set -euo pipefail
        sudo apt-get update
        sudo apt-get install -y shellcheck

    - name: Run shellcheck on scripts
      run: |
        set -euo pipefail
        # -S warning: report findings of severity "warning" and above.
        # Only *.sh files directly under scripts/ are matched by this glob.
        shellcheck -S warning scripts/*.sh