# PR Scraper Cron
# Workflow file for run: PR Scraper Cron #2152

---
# Periodically asks the review-tracking API to scrape pull-request data
# (basic PR info, reviews, CI checks) and then clean up merged/closed PRs.
name: PR Scraper Cron

on:
  schedule:
    # Run every 12 minutes during business hours (Mon-Fri, 6am-6pm PT).
    # GitHub evaluates cron expressions in UTC. PT is UTC-8 (winter) /
    # UTC-7 (summer), so 6am-6pm PT is 14:00-02:00 UTC in winter and
    # 13:00-01:00 UTC in summer; 13:00-01:59 UTC covers both year-round.
    # The 00:00-01:59 UTC hours fall on the *next* UTC day, so they are
    # scheduled Tue-Sat (2-6) to line up with Mon-Fri evenings in PT.
    # 12 min is max safe frequency: ~800 API calls/run × 5 runs/hour = 4,000 (limit: 5,000/hour)
    - cron: '*/12 13-23 * * 1-5'
    - cron: '*/12 0-1 * * 2-6'
  workflow_dispatch:  # Allow manual trigger

jobs:
  scrape:
    runs-on: ubuntu-latest
    # NOTE(review): the worst-case curl timeouts below add up to far more
    # than 20 minutes, and a new run starts every 12 minutes, so a slow run
    # can be cut off or overlap the next one. Consider a `concurrency` group
    # if overlapping scrapes are a problem.
    timeout-minutes: 20
    env:
      # Space-separated list of repositories to scrape. vets-api is the
      # primary repo: it is the only one whose Step 1 failure fails the job.
      REPOS: vets-api vets-api-mockdata platform-atlas
      OWNER: department-of-veterans-affairs
    steps:
      # Step 0: Wake up the Render server (free tier sleeps after inactivity)
      - name: Wake up server
        env:
          API_URL: ${{ secrets.API_URL }}
        run: |
          version_url="${API_URL}/api/v1/reviews/version"
          echo "Pinging server to wake it up..."
          for i in 1 2 3 4 5; do
            status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "$version_url" 2>/dev/null) || true
            if [ "$status" = "200" ]; then
              echo "Server is up (attempt $i)"
              break
            fi
            # curl prints 000 when it never received an HTTP response.
            if [ -z "$status" ] || [ "$status" = "000" ]; then
              status="timeout"
            fi
            echo "Attempt $i: server not ready (status: $status), waiting 15s..."
            sleep 15
          done
          # Final check (best effort: the job continues either way)
          response=$(curl -s --max-time 60 "$version_url") || true
          if [ -z "$response" ]; then
            echo "Warning: Server may not be fully awake yet, proceeding anyway"
          else
            echo "Server responded: $response"
          fi

      # Step 1: Fetch basic PR info for all repos (low memory)
      # NOTE(review): the admin token is sent in the query string, where it
      # can end up in server or proxy access logs; prefer a request header
      # if the API supports one.
      - name: Step 1/4 - Fetch PR info (lite mode)
        env:
          API_URL: ${{ secrets.API_URL }}
          ADMIN_TOKEN: ${{ secrets.ADMIN_TOKEN }}
        run: |
          # Fail fast with a clear message when the secrets are missing.
          : "${API_URL:?API_URL secret is not set}"
          : "${ADMIN_TOKEN:?ADMIN_TOKEN secret is not set}"
          step_failed=false
          for repo in $REPOS; do
            echo "Step 1/4: Fetching basic PR info for $repo at $(date -u '+%Y-%m-%d %H:%M:%S UTC')..."
            # -w appends the status code on its own last line, even on failure.
            response=$(curl -s -w "\n%{http_code}" -X POST \
              "${API_URL}/api/v1/admin/manual_scraper_run?token=${ADMIN_TOKEN}&repository_name=${repo}&repository_owner=${OWNER}&lite_mode=true" \
              -H "Content-Type: application/json" \
              --max-time 300) || true
            http_code=$(printf '%s\n' "$response" | tail -n1)
            body=$(printf '%s\n' "$response" | sed '$d')
            echo "Response ($repo): $body"
            echo "HTTP Status ($repo): $http_code"
            case "$http_code" in
              [1-3][0-9][0-9])
                echo "Step 1/4 completed for $repo"
                continue
                ;;
              '' | 000)
                # No HTTP response at all: timeout or connection failure.
                echo "Error: Step 1 timed out for $repo"
                ;;
              *)
                echo "Error: Step 1 failed for $repo with status $http_code"
                ;;
            esac
            # Only a failure of the primary repo fails the whole step.
            if [ "$repo" = "vets-api" ]; then
              step_failed=true
            fi
          done
          if [ "$step_failed" = true ]; then
            echo "Error: Step 1 failed for primary repo (vets-api)"
            exit 1
          fi
          echo "Step 1/4 completed for all repos"

      # Step 2: Fetch reviews for all repos (medium memory)
      - name: Step 2/4 - Fetch reviews
        env:
          API_URL: ${{ secrets.API_URL }}
          ADMIN_TOKEN: ${{ secrets.ADMIN_TOKEN }}
        run: |
          for repo in $REPOS; do
            echo "Step 2/4: Fetching reviews for $repo at $(date -u '+%Y-%m-%d %H:%M:%S UTC')..."
            response=$(curl -s -w "\n%{http_code}" -X POST \
              "${API_URL}/api/v1/admin/fetch_reviews?token=${ADMIN_TOKEN}&repository_name=${repo}&repository_owner=${OWNER}" \
              -H "Content-Type: application/json" \
              --max-time 540) || true
            http_code=$(printf '%s\n' "$response" | tail -n1)
            body=$(printf '%s\n' "$response" | sed '$d')
            echo "Response ($repo): $body"
            echo "HTTP Status ($repo): $http_code"
            # Failures here are non-fatal: log a warning and keep going.
            case "$http_code" in
              [1-3][0-9][0-9])
                echo "Step 2/4 completed for $repo"
                ;;
              '' | 000)
                echo "Warning: Step 2 timed out or failed for $repo (continuing anyway)"
                ;;
              *)
                echo "Warning: Step 2 failed for $repo with status $http_code (continuing anyway)"
                ;;
            esac
          done
          echo "Step 2/4 completed for all repos"

      # Step 3: Fetch CI checks for all repos (medium memory)
      - name: Step 3/4 - Fetch CI checks
        env:
          API_URL: ${{ secrets.API_URL }}
          ADMIN_TOKEN: ${{ secrets.ADMIN_TOKEN }}
        run: |
          for repo in $REPOS; do
            echo "Step 3/4: Fetching CI checks for $repo at $(date -u '+%Y-%m-%d %H:%M:%S UTC')..."
            response=$(curl -s -w "\n%{http_code}" -X POST \
              "${API_URL}/api/v1/admin/fetch_ci_checks?token=${ADMIN_TOKEN}&repository_name=${repo}&repository_owner=${OWNER}" \
              -H "Content-Type: application/json" \
              --max-time 480) || true
            http_code=$(printf '%s\n' "$response" | tail -n1)
            body=$(printf '%s\n' "$response" | sed '$d')
            echo "Response ($repo): $body"
            echo "HTTP Status ($repo): $http_code"
            # Failures here are non-fatal: log a warning and keep going.
            case "$http_code" in
              [1-3][0-9][0-9])
                echo "Step 3/4 completed for $repo"
                ;;
              '' | 000)
                echo "Warning: Step 3 timed out or failed for $repo (continuing anyway)"
                ;;
              *)
                echo "Warning: Step 3 failed for $repo with status $http_code (continuing anyway)"
                ;;
            esac
          done
          echo "Step 3/4 completed for all repos"

      # Step 4: Cleanup merged/closed PRs
      - name: Step 4/4 - Cleanup merged/closed PRs
        env:
          API_URL: ${{ secrets.API_URL }}
          ADMIN_TOKEN: ${{ secrets.ADMIN_TOKEN }}
        run: |
          echo "Step 4/4: Cleaning up merged/closed PRs at $(date -u '+%Y-%m-%d %H:%M:%S UTC')..."
          response=$(curl -s -w "\n%{http_code}" -X POST \
            "${API_URL}/api/v1/admin/cleanup_merged_prs?token=${ADMIN_TOKEN}" \
            -H "Content-Type: application/json" \
            --max-time 300) || true
          http_code=$(printf '%s\n' "$response" | tail -n1)
          body=$(printf '%s\n' "$response" | sed '$d')
          echo "Response: $body"
          echo "HTTP Status: $http_code"
          # Cleanup is best effort: never fail the job from this step.
          case "$http_code" in
            [1-3][0-9][0-9])
              echo "Step 4/4 completed successfully"
              ;;
            '' | 000)
              echo "Warning: Step 4 timed out or failed (continuing anyway)"
              ;;
            *)
              echo "Warning: Step 4 failed with status $http_code (continuing anyway)"
              ;;
          esac