Skip to content

Commit 5601300

Browse files
committed
Monitor analytics for errors
1 parent 425bf6c commit 5601300

File tree

3 files changed

+166
-5
lines changed

3 files changed

+166
-5
lines changed

api/generate_reports_job.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
This script is executed by Cloud Scheduler via Cloud Run Jobs (daily at midnight).
55
"""
66

7+
import logging
8+
import os
79
import sys
810
from pathlib import Path
911

@@ -12,7 +14,15 @@
1214
PROJECT_ROOT = API_DIR.parent
1315
sys.path.insert(0, str(PROJECT_ROOT))
1416

15-
from lib.analytics import generate_all_reports, init_db
17+
from lib.analytics import generate_all_reports, init_db, check_database_health
18+
19+
# Configure logging for Cloud Logging
20+
# On Cloud Run, logs at ERROR level will trigger alerts
21+
logging.basicConfig(
22+
level=logging.INFO,
23+
format='%(levelname)s: %(message)s'
24+
)
25+
logger = logging.getLogger(__name__)
1626

1727

1828
def main():
@@ -21,7 +31,37 @@ def main():
2131
print("Starting MuniMetro analytics report generation...")
2232
print("-" * 60)
2333

24-
# Ensure database exists
34+
# Check database health first
35+
health = check_database_health()
36+
is_cloud_run = os.getenv('CLOUD_RUN') is not None
37+
38+
print(f"Database health check:")
39+
print(f" exists: {health['exists']}")
40+
if is_cloud_run:
41+
print(f" restored_from_gcs: {health['restored_from_gcs']}")
42+
print(f" has_data: {health['has_data']}")
43+
print(f" check_count: {health['check_count']}")
44+
if health['error']:
45+
print(f" error: {health['error']}")
46+
47+
# Check for critical issues that should trigger alerts
48+
if is_cloud_run and not health['restored_from_gcs'] and not health['exists']:
49+
# On Cloud Run with no database - this is a problem
50+
logger.error(
51+
"ANALYTICS_NO_DATABASE: No analytics database found. "
52+
"Database was not restored from GCS and does not exist locally. "
53+
"This may indicate a GCS backup issue or first-time deployment."
54+
)
55+
# Continue to generate empty reports, but log the error
56+
57+
if not health['has_data']:
58+
if health['check_count'] == 0:
59+
logger.warning(
60+
"ANALYTICS_NO_DATA: Analytics database has no status check records. "
61+
"Reports will be empty until status checks are logged."
62+
)
63+
64+
# Ensure database schema exists
2565
init_db()
2666

2767
# Generate reports for all time periods
@@ -40,10 +80,12 @@ def main():
4080
print("\n✓ All reports generated successfully")
4181
sys.exit(0)
4282
else:
83+
logger.error("ANALYTICS_REPORT_FAILED: Some reports failed to generate")
4384
print("\n⚠️ Some reports failed to generate", file=sys.stderr)
4485
sys.exit(1)
4586

4687
except Exception as e:
88+
logger.error(f"ANALYTICS_ERROR: Exception during report generation: {e}")
4789
print(f"\n❌ Error during report generation: {e}", file=sys.stderr)
4890
import traceback
4991
traceback.print_exc()

deploy/cloud/setup-monitoring.sh

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ PROJECT_ID="${GCP_PROJECT_ID:-munimetro}"
99
REGION="${GCP_REGION:-us-west1}"
1010
API_SERVICE="munimetro-api"
1111
CHECKER_JOB="munimetro-checker"
12+
REPORTS_JOB="munimetro-reports"
1213
ALERT_EMAIL="${ALERT_EMAIL:-}"
1314

1415
echo "=========================================="
@@ -256,7 +257,64 @@ curl -s -X POST \
256257
-d "$JOB_ALERT_JSON" \
257258
"https://monitoring.googleapis.com/v3/projects/$PROJECT_ID/alertPolicies" > /dev/null
258259
rm /tmp/alert-job-failures.json
259-
echo "✓ Job failure alert policy created"
260+
echo "✓ Checker job failure alert policy created"
261+
262+
# Create alert policy for Reports job failures (includes missing database errors).
#
# The policy fires when the Cloud Run job named $REPORTS_JOB records at least one
# failed execution inside a 24h alignment window (ALIGN_DELTA over 86400s).
# This section reads $PROJECT_ID, $REPORTS_JOB, $CHANNEL_ID, $ACCESS_TOKEN and
# $EXISTING_POLICIES, all of which are set elsewhere in this script.
#
# The payload is built directly into a variable instead of a fixed /tmp path so
# nothing is left on disk if the script aborts part-way. The heredoc delimiter is
# unquoted on purpose: $REPORTS_JOB and $CHANNEL_ID must expand, while \" is kept
# literally (backslash-quote is not an escape inside a heredoc) to form JSON escapes.
REPORTS_ALERT_JSON=$(cat <<EOF
{
  "displayName": "MuniMetro Reports Job Failures",
  "conditions": [{
    "displayName": "Reports job failing",
    "conditionThreshold": {
      "filter": "resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"$REPORTS_JOB\" AND metric.type=\"run.googleapis.com/job/completed_execution_count\" AND metric.labels.result=\"failed\"",
      "comparison": "COMPARISON_GT",
      "thresholdValue": 0,
      "duration": "0s",
      "aggregations": [{
        "alignmentPeriod": "86400s",
        "perSeriesAligner": "ALIGN_DELTA"
      }]
    }
  }],
  "alertStrategy": {
    "autoClose": "86400s"
  },
  "combiner": "OR",
  "enabled": true,
  "notificationChannels": ["$CHANNEL_ID"],
  "documentation": {
    "content": "The MuniMetro analytics reports job has failed. This may indicate a missing or corrupted analytics database. Check Cloud Run logs for ANALYTICS_NO_DATABASE or ANALYTICS_ERROR messages.",
    "mimeType": "text/markdown"
  }
}
EOF
)

# Look up an existing policy with the same display name so it can be replaced.
# Prints the policy resource name, or nothing if the list is missing/unparseable.
EXISTING_REPORTS_ALERT=$(echo "$EXISTING_POLICIES" | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    for policy in data.get('alertPolicies', []):
        if policy.get('displayName') == 'MuniMetro Reports Job Failures':
            print(policy['name'])
            break
except Exception:
    # Best effort: treat unreadable input as 'no existing policy'.
    pass
" 2>/dev/null)

# "Updating" is implemented as delete-then-recreate.
if [ -n "$EXISTING_REPORTS_ALERT" ]; then
    echo "⚠️ Reports job failure alert policy already exists, updating..."
    # --fail makes curl return non-zero on HTTP >= 400; warn but keep going.
    curl -s --fail -X DELETE -H "Authorization: Bearer $ACCESS_TOKEN" \
        "https://monitoring.googleapis.com/v3/$EXISTING_REPORTS_ALERT" > /dev/null \
        || echo "⚠️ Could not delete existing Reports job failure alert policy" >&2
fi

# Only report success when the API actually accepted the policy.
if curl -s --fail -X POST \
    -H "Authorization: Bearer $ACCESS_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$REPORTS_ALERT_JSON" \
    "https://monitoring.googleapis.com/v3/projects/$PROJECT_ID/alertPolicies" > /dev/null; then
    echo "✓ Reports job failure alert policy created"
else
    echo "✗ Failed to create Reports job failure alert policy" >&2
fi
260318

261319
# Create alert policy for API error rate
262320
cat > /tmp/alert-error-rate.json <<EOF
@@ -323,8 +381,9 @@ echo "=========================================="
323381
echo ""
324382
echo "Configured alerts:"
325383
echo " 1. API Down - Health check failing for 5+ minutes"
326-
echo " 2. Job Failures - Checker job failing repeatedly"
327-
echo " 3. High Error Rate - API returning >10% errors"
384+
echo " 2. Checker Job Failures - Status checker job failing repeatedly"
385+
echo " 3. Reports Job Failures - Analytics reports job failing (includes missing database)"
386+
echo " 4. High Error Rate - API returning >10% errors"
328387
echo ""
329388
echo "Notifications will be sent to: $ALERT_EMAIL"
330389
echo ""

lib/analytics.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -688,3 +688,63 @@ def generate_all_reports():
688688
results[days] = {'success': False}
689689

690690
return results
691+
692+
693+
def check_database_health():
    """
    Check if the analytics database exists and has data.

    When ``_is_cloud_run()`` is true, ``restore_db_from_gcs()`` is called
    first and its return value is recorded. The local database file is then
    checked, and if present, the ``status_checks`` table is looked up and
    its rows counted.

    This function never raises: any exception is caught and its message is
    reported through the ``'error'`` key, with the remaining keys left at
    whatever values had been established before the failure.

    Returns:
        dict: {
            'exists': bool,              # Database file exists locally
            'restored_from_gcs': bool,   # Was restored from GCS (Cloud Run only)
            'has_data': bool,            # Has any status checks recorded
            'check_count': int,          # Number of status checks
            'error': str or None         # Error message if any
        }
    """
    result = {
        'exists': False,
        'restored_from_gcs': False,
        'has_data': False,
        'check_count': 0,
        'error': None
    }

    try:
        # On Cloud Run, try to restore from GCS first
        if _is_cloud_run():
            result['restored_from_gcs'] = restore_db_from_gcs()

        # Check if database file exists
        result['exists'] = LOCAL_DB_PATH.exists()

        if not result['exists']:
            return result

        conn = get_db_connection()
        try:
            cursor = conn.cursor()

            # Check if tables exist
            cursor.execute('''
                SELECT name FROM sqlite_master
                WHERE type='table' AND name='status_checks'
            ''')
            if not cursor.fetchone():
                result['error'] = 'status_checks table does not exist'
                return result

            # Count status checks.
            # NOTE(review): access by column name assumes get_db_connection()
            # configures a name-addressable row factory - confirm.
            cursor.execute('SELECT COUNT(*) as count FROM status_checks')
            count = cursor.fetchone()['count']
        finally:
            # Always release the connection, including when a query above
            # raises; previously it was leaked on the exception path.
            conn.close()

        result['check_count'] = count
        result['has_data'] = count > 0

    except Exception as e:
        # Health checks must not crash the caller; surface the failure instead.
        result['error'] = str(e)

    return result

0 commit comments

Comments
 (0)