Skip to content

Prune old data

Prune old data #25

Workflow file for this run

# .github/workflows/prune.yml
name: Prune old data
on:
schedule:
- cron: '0 3 * * *' # Daily at 03:00 UTC
workflow_dispatch:
inputs:
retention_days:
description: 'Retention window in days'
required: false
default: '14'
embedding_retention_hours:
description: 'Article embedding retention window in hours'
required: false
default: '48'
recluster_cache_retention_hours:
description: 'Recluster decision cache retention window in hours'
required: false
default: '48'
jobs:
prune:
runs-on: ubuntu-latest
steps:
- name: Prune old articles and orphaned stories
env:
DATABASE_URL: ${{ secrets.DATABASE_URL }}
RETENTION_DAYS: ${{ github.event.inputs.retention_days || '14' }}
EMBEDDING_RETENTION_HOURS: ${{ github.event.inputs.embedding_retention_hours || '48' }}
RECLUSTER_CACHE_RETENTION_HOURS: ${{ github.event.inputs.recluster_cache_retention_hours || '48' }}
R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }}
R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
R2_BUCKET: ${{ secrets.R2_BUCKET }}
R2_ARCHIVE_PREFIX: archive
run: |
set -euo pipefail
CUTOFF=$(date -u -d "-${RETENTION_DAYS} days" '+%Y-%m-%d %H:%M:%S')
ARCHIVE_DATE=$(date -u '+%Y/%m/%d')
ARCHIVE_TS=$(date -u '+%Y%m%dT%H%M%SZ')
ARCHIVE_DIR="$RUNNER_TEMP/prune-archive"
mkdir -p "$ARCHIVE_DIR"
R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com"
: "${R2_ACCOUNT_ID:?Set R2_ACCOUNT_ID secret}"
: "${R2_ACCESS_KEY_ID:?Set R2_ACCESS_KEY_ID secret}"
: "${R2_SECRET_ACCESS_KEY:?Set R2_SECRET_ACCESS_KEY secret}"
: "${R2_BUCKET:?Set R2_BUCKET secret}"
export AWS_ACCESS_KEY_ID="$R2_ACCESS_KEY_ID"
export AWS_SECRET_ACCESS_KEY="$R2_SECRET_ACCESS_KEY"
export AWS_DEFAULT_REGION="auto"
echo "Pruning articles older than ${CUTOFF} (${RETENTION_DAYS}-day retention)"
echo "Nulling article embeddings older than ${EMBEDDING_RETENTION_HOURS} hours"
NULLED_EMBEDDINGS=$(psql "$DATABASE_URL" -t -A -c "
WITH nulled AS (
UPDATE articles
SET embedding = NULL
WHERE embedding IS NOT NULL
AND created_at < now() - (${EMBEDDING_RETENTION_HOURS}::text || ' hours')::interval
RETURNING id
)
SELECT count(*) FROM nulled;
")
echo "Nulled ${NULLED_EMBEDDINGS} old article embeddings"
echo "Pruning recluster decision cache entries older than ${RECLUSTER_CACHE_RETENTION_HOURS} hours"
DELETED_RECLUSTER_CACHE=$(psql "$DATABASE_URL" -t -A -c "
WITH deleted AS (
DELETE FROM recluster_decision_cache
WHERE last_used_at < now() - (${RECLUSTER_CACHE_RETENTION_HOURS}::text || ' hours')::interval
RETURNING fingerprint
)
SELECT count(*) FROM deleted;
")
echo "Deleted ${DELETED_RECLUSTER_CACHE} old recluster decision cache entries"
ARTICLES_ARCHIVE="${ARCHIVE_DIR}/articles_${ARCHIVE_TS}.csv.gz"
STORIES_ARCHIVE="${ARCHIVE_DIR}/stories_${ARCHIVE_TS}.csv.gz"
psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -c "COPY (
SELECT
id, source_id, url, title, lead, summary, main_event, story_identity,
article_type, location, entities, topics, category, author, image_url,
published_at, scraped_at, fingerprint, story_id, created_at
FROM articles
WHERE scraped_at < '${CUTOFF}'::timestamptz
ORDER BY scraped_at, id
) TO STDOUT WITH CSV HEADER" | gzip > "$ARTICLES_ARCHIVE"
psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -c "COPY (
WITH deleted_articles AS (
SELECT id, story_id
FROM articles
WHERE scraped_at < '${CUTOFF}'::timestamptz
),
stories_after_delete AS (
SELECT s.*
FROM stories s
WHERE NOT EXISTS (
SELECT 1
FROM articles a
WHERE a.story_id = s.id
AND NOT EXISTS (
SELECT 1 FROM deleted_articles d WHERE d.id = a.id
)
)
)
SELECT
id, title, summary, topic, topics, article_count, source_count,
relevance_score, entities, first_seen_at, updated_at, created_at
FROM stories_after_delete
ORDER BY updated_at, id
) TO STDOUT WITH CSV HEADER" | gzip > "$STORIES_ARCHIVE"
ARTICLES_KEY="${R2_ARCHIVE_PREFIX}/articles/${ARCHIVE_DATE}/articles_${ARCHIVE_TS}.csv.gz"
STORIES_KEY="${R2_ARCHIVE_PREFIX}/stories/${ARCHIVE_DATE}/stories_${ARCHIVE_TS}.csv.gz"
aws s3 cp "$ARTICLES_ARCHIVE" "s3://${R2_BUCKET}/${ARTICLES_KEY}" --endpoint-url "$R2_ENDPOINT"
aws s3 cp "$STORIES_ARCHIVE" "s3://${R2_BUCKET}/${STORIES_KEY}" --endpoint-url "$R2_ENDPOINT"
echo "Archived articles to s3://${R2_BUCKET}/${ARTICLES_KEY}"
echo "Archived stories to s3://${R2_BUCKET}/${STORIES_KEY}"
DELETED=$(psql "$DATABASE_URL" -t -A -c "
WITH deleted AS (
DELETE FROM articles
WHERE scraped_at < '${CUTOFF}'::timestamptz
RETURNING id
)
SELECT count(*) FROM deleted;
")
echo "Deleted ${DELETED} old articles"
ORPHANED=$(psql "$DATABASE_URL" -t -A -c "
WITH orphaned AS (
DELETE FROM stories s
WHERE NOT EXISTS (
SELECT 1 FROM articles a WHERE a.story_id = s.id
)
RETURNING id
)
SELECT count(*) FROM orphaned;
")
echo "Deleted ${ORPHANED} orphaned stories"
psql "$DATABASE_URL" -c "
UPDATE stories s SET
article_count = sub.ac,
source_count = sub.sc
FROM (
SELECT story_id,
count(*) AS ac,
count(DISTINCT source_id) AS sc
FROM articles
WHERE story_id IS NOT NULL
GROUP BY story_id
) sub
WHERE s.id = sub.story_id
AND (s.article_count != sub.ac OR s.source_count != sub.sc);
"
echo "Updated story counts"
echo "Prune complete: nulled_embeddings=${NULLED_EMBEDDINGS}, deleted_recluster_cache=${DELETED_RECLUSTER_CACHE}, deleted=${DELETED} articles, orphaned=${ORPHANED} stories"