Prune old data #25

Workflow file for this run

	# .github/workflows/prune.yml
	name: Prune old data

	# Two triggers: a nightly schedule and a manual dispatch whose optional
	# inputs override the retention windows for that one run.
	on:
	  schedule:
	    # Daily at 03:00 UTC
	    - cron: "0 3 * * *"
	  workflow_dispatch:
	    inputs:
	      # All three inputs are optional strings; the job also supplies the
	      # same fallback values because scheduled runs carry no inputs.
	      retention_days:
	        description: "Retention window in days"
	        required: false
	        default: "14"
	      embedding_retention_hours:
	        description: "Article embedding retention window in hours"
	        required: false
	        default: "48"
	      recluster_cache_retention_hours:
	        description: "Recluster decision cache retention window in hours"
	        required: false
	        default: "48"

	jobs:
	  prune:
	    runs-on: ubuntu-latest
	    steps:
	      # One step does everything, in this order:
	      #   1. validate configuration,
	      #   2. null out old article embeddings,
	      #   3. prune the recluster decision cache,
	      #   4. export the rows about to be deleted to gzip'd CSV and upload
	      #      them to the R2 bucket,
	      #   5. delete old articles, then stories left without articles,
	      #   6. resynchronise the per-story article/source counters.
	      # Deletion only happens after both uploads succeeded, because
	      # `set -e` aborts the script on the first failing command.
	      - name: Prune old articles and orphaned stories
	        env:
	          DATABASE_URL: ${{ secrets.DATABASE_URL }}
	          # Scheduled runs have no dispatch inputs, hence the `||` fallbacks.
	          RETENTION_DAYS: ${{ github.event.inputs.retention_days || '14' }}
	          EMBEDDING_RETENTION_HOURS: ${{ github.event.inputs.embedding_retention_hours || '48' }}
	          RECLUSTER_CACHE_RETENTION_HOURS: ${{ github.event.inputs.recluster_cache_retention_hours || '48' }}
	          R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }}
	          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
	          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
	          R2_BUCKET: ${{ secrets.R2_BUCKET }}
	          R2_ARCHIVE_PREFIX: archive
	        run: |
	          set -euo pipefail

	          # --- Configuration checks (before anything touches the database) ---
	          : "${DATABASE_URL:?Set DATABASE_URL secret}"
	          : "${R2_ACCOUNT_ID:?Set R2_ACCOUNT_ID secret}"
	          : "${R2_ACCESS_KEY_ID:?Set R2_ACCESS_KEY_ID secret}"
	          : "${R2_SECRET_ACCESS_KEY:?Set R2_SECRET_ACCESS_KEY secret}"
	          : "${R2_BUCKET:?Set R2_BUCKET secret}"

	          # The retention values are spliced into `date -d` and into SQL text
	          # below, so accept nothing but plain non-negative integers.
	          for name in RETENTION_DAYS EMBEDDING_RETENTION_HOURS RECLUSTER_CACHE_RETENTION_HOURS; do
	            if ! [[ "${!name}" =~ ^[0-9]+$ ]]; then
	              echo "::error::${name} must be a non-negative integer, got '${!name}'"
	              exit 1
	            fi
	          done

	          # Built only after R2_ACCOUNT_ID is known to be non-empty.
	          R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com"

	          # The explicit +00 offset pins the cutoff to UTC; without it the
	          # ::timestamptz casts below would interpret the literal in the
	          # database session's TimeZone.
	          CUTOFF=$(date -u -d "-${RETENTION_DAYS} days" '+%Y-%m-%d %H:%M:%S+00')
	          ARCHIVE_DATE=$(date -u '+%Y/%m/%d')
	          ARCHIVE_TS=$(date -u '+%Y%m%dT%H%M%SZ')
	          ARCHIVE_DIR="$RUNNER_TEMP/prune-archive"
	          mkdir -p "$ARCHIVE_DIR"

	          # The aws CLI reads its credentials from these variables.
	          export AWS_ACCESS_KEY_ID="$R2_ACCESS_KEY_ID"
	          export AWS_SECRET_ACCESS_KEY="$R2_SECRET_ACCESS_KEY"
	          export AWS_DEFAULT_REGION="auto"

	          echo "Pruning articles older than ${CUTOFF} (${RETENTION_DAYS}-day retention)"

	          # --- Embeddings: keep the article row, drop only the vector ---
	          # ON_ERROR_STOP is set on every psql call so any SQL error fails
	          # the step.
	          echo "Nulling article embeddings older than ${EMBEDDING_RETENTION_HOURS} hours"
	          NULLED_EMBEDDINGS=$(psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -t -A -c "
	            WITH nulled AS (
	              UPDATE articles
	              SET embedding = NULL
	              WHERE embedding IS NOT NULL
	                AND created_at < now() - (${EMBEDDING_RETENTION_HOURS}::text || ' hours')::interval
	              RETURNING id
	            )
	            SELECT count(*) FROM nulled;
	          ")
	          echo "Nulled ${NULLED_EMBEDDINGS} old article embeddings"

	          # --- Recluster decision cache ---
	          echo "Pruning recluster decision cache entries older than ${RECLUSTER_CACHE_RETENTION_HOURS} hours"
	          DELETED_RECLUSTER_CACHE=$(psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -t -A -c "
	            WITH deleted AS (
	              DELETE FROM recluster_decision_cache
	              WHERE last_used_at < now() - (${RECLUSTER_CACHE_RETENTION_HOURS}::text || ' hours')::interval
	              RETURNING fingerprint
	            )
	            SELECT count(*) FROM deleted;
	          ")
	          echo "Deleted ${DELETED_RECLUSTER_CACHE} old recluster decision cache entries"

	          # --- Archive: export exactly the rows the delete phase will remove ---
	          ARTICLES_ARCHIVE="${ARCHIVE_DIR}/articles_${ARCHIVE_TS}.csv.gz"
	          STORIES_ARCHIVE="${ARCHIVE_DIR}/stories_${ARCHIVE_TS}.csv.gz"

	          # `pipefail` makes a psql failure fail the pipeline even though
	          # gzip itself would exit 0.
	          psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -c "COPY (
	            SELECT
	              id, source_id, url, title, lead, summary, main_event, story_identity,
	              article_type, location, entities, topics, category, author, image_url,
	              published_at, scraped_at, fingerprint, story_id, created_at
	            FROM articles
	            WHERE scraped_at < '${CUTOFF}'::timestamptz
	            ORDER BY scraped_at, id
	          ) TO STDOUT WITH CSV HEADER" | gzip > "$ARTICLES_ARCHIVE"

	          # Stories that will have no surviving article once the articles
	          # older than the cutoff are gone (includes stories that already
	          # have no articles at all).
	          psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -c "COPY (
	            WITH deleted_articles AS (
	              SELECT id, story_id
	              FROM articles
	              WHERE scraped_at < '${CUTOFF}'::timestamptz
	            ),
	            stories_after_delete AS (
	              SELECT s.*
	              FROM stories s
	              WHERE NOT EXISTS (
	                SELECT 1
	                FROM articles a
	                WHERE a.story_id = s.id
	                  AND NOT EXISTS (
	                    SELECT 1 FROM deleted_articles d WHERE d.id = a.id
	                  )
	              )
	            )
	            SELECT
	              id, title, summary, topic, topics, article_count, source_count,
	              relevance_score, entities, first_seen_at, updated_at, created_at
	            FROM stories_after_delete
	            ORDER BY updated_at, id
	          ) TO STDOUT WITH CSV HEADER" | gzip > "$STORIES_ARCHIVE"

	          ARTICLES_KEY="${R2_ARCHIVE_PREFIX}/articles/${ARCHIVE_DATE}/articles_${ARCHIVE_TS}.csv.gz"
	          STORIES_KEY="${R2_ARCHIVE_PREFIX}/stories/${ARCHIVE_DATE}/stories_${ARCHIVE_TS}.csv.gz"

	          aws s3 cp "$ARTICLES_ARCHIVE" "s3://${R2_BUCKET}/${ARTICLES_KEY}" --endpoint-url "$R2_ENDPOINT"
	          aws s3 cp "$STORIES_ARCHIVE" "s3://${R2_BUCKET}/${STORIES_KEY}" --endpoint-url "$R2_ENDPOINT"

	          echo "Archived articles to s3://${R2_BUCKET}/${ARTICLES_KEY}"
	          echo "Archived stories to s3://${R2_BUCKET}/${STORIES_KEY}"

	          # --- Delete: same cutoff predicate as the articles export above ---
	          DELETED=$(psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -t -A -c "
	            WITH deleted AS (
	              DELETE FROM articles
	              WHERE scraped_at < '${CUTOFF}'::timestamptz
	              RETURNING id
	            )
	            SELECT count(*) FROM deleted;
	          ")
	          echo "Deleted ${DELETED} old articles"

	          # Remove every story that no article references any more.
	          ORPHANED=$(psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -t -A -c "
	            WITH orphaned AS (
	              DELETE FROM stories s
	              WHERE NOT EXISTS (
	                SELECT 1 FROM articles a WHERE a.story_id = s.id
	              )
	              RETURNING id
	            )
	            SELECT count(*) FROM orphaned;
	          ")
	          echo "Deleted ${ORPHANED} orphaned stories"

	          # --- Recount: rewrite only stories whose stored counters drifted ---
	          psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -c "
	            UPDATE stories s SET
	              article_count = sub.ac,
	              source_count = sub.sc
	            FROM (
	              SELECT story_id,
	                     count(*) AS ac,
	                     count(DISTINCT source_id) AS sc
	              FROM articles
	              WHERE story_id IS NOT NULL
	              GROUP BY story_id
	            ) sub
	            WHERE s.id = sub.story_id
	              AND (s.article_count != sub.ac OR s.source_count != sub.sc);
	          "
	          echo "Updated story counts"
	          echo "Prune complete: nulled_embeddings=${NULLED_EMBEDDINGS}, deleted_recluster_cache=${DELETED_RECLUSTER_CACHE}, deleted=${DELETED} articles, orphaned=${ORPHANED} stories"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Prune old data #25

Workflow file

Prune old data #25

Uh oh!

Workflow file for this run