|
| 1 | +# MACHINE_INTEGRATION_SPEC.md |
| 2 | +# Project Integration Specification for mosquito-alert-model-monitor |
| 3 | +# This file contains all information needed to integrate any project with the dashboard |
| 4 | +# Written for an AI assistant to understand and implement without errors |
| 5 | + |
| 6 | +## INTEGRATION_METADATA |
| 7 | +```yaml |
| 8 | +dashboard_repo: "mosquito-alert-model-monitor" |
| 9 | +dashboard_path: "$HOME/research/mosquito-alert-model-monitor" |
| 10 | +integration_version: "3.0_bulletproof" |
| 11 | +compatible_with: ["bash", "python", "R", "SLURM"] |
| 12 | +git_conflict_resilient: true |
| 13 | +``` |
| 14 | +
|
| 15 | +## ARCHITECTURE_PRINCIPLES |
| 16 | +
|
| 17 | +### SEPARATION_OF_CONCERNS |
| 18 | +- **Main projects**: Focus on their core mission, fail fast on infrastructure issues |
| 19 | +- **Monitor project**: Handle all dashboard complexity, never fail calling jobs |
| 20 | +- **Status integration**: Simple drop-in calls, always exit 0 |
| 21 | +
|
| 22 | +### ROBUSTNESS_HIERARCHY |
| 23 | +1. **Core data collection NEVER fails due to dashboard issues** |
| 24 | +2. **Status updates are best-effort only** |
| 25 | +3. **Git conflicts handled gracefully without blocking jobs** |
| 26 | +4. **Module loading in main scripts is mandatory, not defensive: if a required module fails to load, the job should fail** |
| 27 | +
|
| 28 | +## CORRECT_INTEGRATION_PATTERN |
| 29 | +
|
| 30 | +### FOR_MAIN_PROJECT_SCRIPTS |
| 31 | +```bash |
| 32 | +#!/bin/bash |
| 33 | +# DO THIS: Simple, focused scripts that do their job |
| 34 | +#SBATCH --job-name=my-job |
| 35 | + |
| 36 | +# Load required modules (MUST succeed or job should fail) |
| 37 | +module load R/4.4.2-gfbf-2024a |
| 38 | +module load cURL/8.7.1-GCCcore-13.3.0 |
| 39 | + |
| 40 | +# Set up job |
| 41 | +JOB_NAME="my-job-name" |
| 42 | +STATUS_SCRIPT="./scripts/update_weather_status.sh" # Drop-in wrapper |
| 43 | +START_TIME=$(date +%s) |
| 44 | + |
| 45 | +# Status updates (never fail) |
| 46 | +$STATUS_SCRIPT "$JOB_NAME" "running" 0 5 |
| 47 | + |
| 48 | +# Do the actual work |
| 49 | +echo "Starting main work..." |
| 50 | +python my_main_script.py |
| 51 | + |
| 52 | +# Final status |
| 53 | +$STATUS_SCRIPT "$JOB_NAME" "completed" $(($(date +%s) - START_TIME)) 100 |
| 54 | +``` |
| 55 | + |
| 56 | +### FOR_PROJECT_STATUS_WRAPPER |
| 57 | +```bash |
| 58 | +#!/bin/bash |
| 59 | +# Drop-in status update script: scripts/update_weather_status.sh |
| 60 | +# This script NEVER fails the calling job |
| 61 | + |
| 62 | +JOB_NAME="${1:-project-unknown}" |
| 63 | +STATUS="${2:-unknown}" |
| 64 | +DURATION="${3:-0}" |
| 65 | +PROGRESS="${4:-0}" |
| 66 | +LOG_MESSAGE="${5:-Job status update}" |
| 67 | + |
| 68 | +# Use the robust monitor script if available |
| 69 | +MONITOR_SCRIPT="$HOME/research/mosquito-alert-model-monitor/scripts/update_job_status.sh" |
| 70 | + |
| 71 | +if [ -f "$MONITOR_SCRIPT" ]; then |
| 72 | + echo "📊 Updating dashboard via monitor project..." |
| 73 | + "$MONITOR_SCRIPT" "$JOB_NAME" "$STATUS" "$DURATION" "$PROGRESS" "$LOG_MESSAGE" |
| 74 | +else |
| 75 | + echo "⚠️ Monitor project not found - skipping dashboard update" |
| 76 | +fi |
| 77 | + |
| 78 | +# ALWAYS exit successfully so calling jobs continue |
| 79 | +exit 0 |
| 80 | +``` |
| 81 | +```bash |
| 81 | +# Job progress update (while running) |
| 81 | +~/research/mosquito-alert-model-monitor/scripts/update_job_status.sh "PROJECT_JOB_NAME" "running" $ELAPSED_SECONDS $PROGRESS_PERCENT "Current step description" |
| 82 | + |
| 83 | +# Job completion (success) |
| 84 | +~/research/mosquito-alert-model-monitor/scripts/update_job_status.sh "PROJECT_JOB_NAME" "completed" $TOTAL_SECONDS 100 "Job completed successfully" |
| 85 | + |
| 86 | +# Job failure (in error handling) |
| 87 | +~/research/mosquito-alert-model-monitor/scripts/update_job_status.sh "PROJECT_JOB_NAME" "failed" $ELAPSED_SECONDS $PROGRESS_PERCENT "Error: description" |
| 88 | +``` |
| 89 | +
|
| 90 | +### STEP_3_LOG_INTEGRATION |
| 91 | +```bash |
| 92 | +# OPTIONAL: Add at end of main script for log collection |
| 93 | +~/research/mosquito-alert-model-monitor/scripts/collect_logs.sh "" "PROJECT_NAME" "PROJECT_LOG_DIR_PATH" |
| 94 | +``` |
| 95 | + |
| 96 | +### STEP_4_VARIABLES_TO_REPLACE |
| 97 | +```yaml |
| 98 | +replacements_needed: |
| 99 | + PROJECT_JOB_NAME: |
| 100 | + description: "Main identifier for this job in dashboard" |
| 101 | + examples: ["prepare_malert_data", "weather-forecast", "model_training"] |
| 102 | + format: "lowercase_with_underscores_or_hyphens" |
| 103 | + |
| 104 | + PROJECT_NAME: |
| 105 | + description: "Short project identifier" |
| 106 | + examples: ["mosquito_model_data_prep", "weather", "ml_pipeline"] |
| 107 | + |
| 108 | + PROJECT_LOG_DIR_PATH: |
| 109 | + description: "Path to project log directory (absolute path preferred)" |
| 110 | + examples: ["$HOME/research/project_name/logs", "./logs", "/path/to/logs"] |
| 111 | + |
| 112 | + ELAPSED_SECONDS: |
| 113 | + description: "Time since job start" |
| 114 | + calculation: "$(($(date +%s) - $START_TIME))" |
| 115 | + |
| 116 | + PROGRESS_PERCENT: |
| 117 | + description: "Job completion percentage (0-100)" |
| 118 | + examples: [0, 25, 50, 75, 100] |
| 119 | + |
| 120 | + TOTAL_SECONDS: |
| 121 | + description: "Total job duration at completion" |
| 122 | + calculation: "$(($(date +%s) - $START_TIME))" |
| 123 | +``` |
| 124 | +
|
| 125 | +## INTEGRATION_TEMPLATES |
| 126 | +
|
| 127 | +### BASH_SCRIPT_TEMPLATE |
| 128 | +```bash |
| 129 | +#!/bin/bash |
| 130 | +# Add at beginning of main script: |
| 131 | + |
| 132 | +# Dashboard integration setup |
| 133 | +DASHBOARD_SCRIPT="$HOME/research/mosquito-alert-model-monitor/scripts/update_job_status.sh" |
| 134 | +JOB_NAME="PROJECT_JOB_NAME" # REPLACE WITH ACTUAL JOB NAME |
| 135 | +START_TIME=$(date +%s) |
| 136 | + |
| 137 | +# Job start notification |
| 138 | +$DASHBOARD_SCRIPT "$JOB_NAME" "running" 0 0 "Starting PROJECT_DESCRIPTION" |
| 139 | + |
| 140 | +# Add throughout script for progress: |
| 141 | +# $DASHBOARD_SCRIPT "$JOB_NAME" "running" $(($(date +%s) - $START_TIME)) PROGRESS_PERCENT "STEP_DESCRIPTION" |
| 142 | + |
| 143 | +# Example progress calls: |
| 144 | +$DASHBOARD_SCRIPT "$JOB_NAME" "running" $(($(date +%s) - $START_TIME)) 25 "Data loading complete" |
| 145 | +$DASHBOARD_SCRIPT "$JOB_NAME" "running" $(($(date +%s) - $START_TIME)) 50 "Processing data" |
| 146 | +$DASHBOARD_SCRIPT "$JOB_NAME" "running" $(($(date +%s) - $START_TIME)) 75 "Generating outputs" |
| 147 | + |
| 148 | +# At end of script: |
| 149 | +$DASHBOARD_SCRIPT "$JOB_NAME" "completed" $(($(date +%s) - $START_TIME)) 100 "Job completed successfully" |
| 150 | + |
| 151 | +# Optional log collection: |
| 152 | +$HOME/research/mosquito-alert-model-monitor/scripts/collect_logs.sh "" "PROJECT_NAME" "./logs" |
| 153 | +``` |
| 154 | + |
| 155 | +### PYTHON_SCRIPT_TEMPLATE |
| 156 | +```python |
| 157 | +import os |
| 157 | +import subprocess |
| 158 | +import time |
| 159 | +import sys |
| 160 | + |
| 161 | +# Dashboard integration |
| 162 | +DASHBOARD_SCRIPT = os.path.expanduser("~/research/mosquito-alert-model-monitor/scripts/update_job_status.sh") |
| 163 | +JOB_NAME = "PROJECT_JOB_NAME" # REPLACE WITH ACTUAL JOB NAME |
| 164 | +start_time = time.time() |
| 165 | + |
| 166 | +def update_status(status, progress, message): |
| 167 | + """Update job status in dashboard""" |
| 168 | + elapsed = int(time.time() - start_time) |
| 169 | + try: |
| 170 | + subprocess.run([DASHBOARD_SCRIPT, JOB_NAME, status, str(elapsed), str(progress), message], |
| 171 | + check=False, capture_output=True) |
| 172 | + except: |
| 173 | + pass # Never fail the main job due to dashboard issues |
| 174 | + |
| 175 | +# Job start |
| 176 | +update_status("running", 0, "Starting PROJECT_DESCRIPTION") |
| 177 | + |
| 178 | +# Progress updates throughout code: |
| 179 | +update_status("running", 25, "Data loading complete") |
| 180 | +update_status("running", 50, "Processing data") |
| 181 | +update_status("running", 75, "Generating outputs") |
| 182 | + |
| 183 | +# Job completion |
| 184 | +update_status("completed", 100, "Job completed successfully") |
| 185 | +``` |
| 186 | + |
| 187 | +### R_SCRIPT_TEMPLATE |
| 188 | +```r |
| 189 | +# Dashboard integration for R scripts |
| 190 | +dashboard_script <- "~/research/mosquito-alert-model-monitor/scripts/update_job_status.sh" |
| 191 | +job_name <- "PROJECT_JOB_NAME" # REPLACE WITH ACTUAL JOB NAME |
| 192 | +start_time <- Sys.time() |
| 193 | + |
| 194 | +update_status <- function(status, progress, message) { |
| 195 | + elapsed <- as.integer(difftime(Sys.time(), start_time, units = "secs")) |
| 196 | + tryCatch({ |
| 197 | + system(paste(dashboard_script, job_name, status, elapsed, progress, shQuote(message)), |
| 198 | + ignore.stdout = TRUE, ignore.stderr = TRUE) |
| 199 | + }, error = function(e) { |
| 200 | + # Never fail the main job due to dashboard issues |
| 201 | + }) |
| 202 | +} |
| 203 | + |
| 204 | +# Job start |
| 205 | +update_status("running", 0, "Starting PROJECT_DESCRIPTION") |
| 206 | + |
| 207 | +# Progress updates throughout code: |
| 208 | +update_status("running", 25, "Data loading complete") |
| 209 | +update_status("running", 50, "Processing data") |
| 210 | +update_status("running", 75, "Generating outputs") |
| 211 | + |
| 212 | +# Job completion |
| 213 | +update_status("completed", 100, "Job completed successfully") |
| 214 | +``` |
| 215 | + |
| 216 | +## SLURM_INTEGRATION |
| 217 | +```bash |
| 218 | +# Add to SLURM script headers: |
| 219 | +#SBATCH --job-name=PROJECT_JOB_NAME |
| 220 | + |
| 221 | +# Add after SLURM setup, before main work: |
| 222 | +DASHBOARD_SCRIPT="$HOME/research/mosquito-alert-model-monitor/scripts/update_job_status.sh" |
| 223 | +JOB_NAME="PROJECT_JOB_NAME" |
| 224 | +START_TIME=$(date +%s) |
| 225 | + |
| 226 | +$DASHBOARD_SCRIPT "$JOB_NAME" "running" 0 0 "SLURM job started (ID: $SLURM_JOB_ID)" |
| 227 | + |
| 228 | +# Add before exit: |
| 229 | +$DASHBOARD_SCRIPT "$JOB_NAME" "completed" $(($(date +%s) - $START_TIME)) 100 "SLURM job completed (ID: $SLURM_JOB_ID)" |
| 230 | +``` |
| 231 | + |
| 232 | +## ERROR_HANDLING_PATTERN |
| 233 | +```bash |
| 234 | +# Robust error handling that doesn't break jobs |
| 235 | +set +e # Don't exit on dashboard errors |
| 236 | + |
| 237 | +# Wrap main job logic |
| 238 | +main_job_function() { |
| 239 | + # Your original job code here |
| 240 | + return $? |
| 241 | +} |
| 242 | + |
| 243 | +# Call main job with error handling |
| 244 | +if main_job_function; then |
| 245 | + $DASHBOARD_SCRIPT "$JOB_NAME" "completed" $(($(date +%s) - $START_TIME)) 100 "Job completed successfully" |
| 246 | +else |
| 247 | + $DASHBOARD_SCRIPT "$JOB_NAME" "failed" $(($(date +%s) - $START_TIME)) 50 "Job failed with error" |
| 248 | + exit 1 |
| 249 | +fi |
| 250 | +``` |
| 251 | + |
| 252 | +## TESTING_CHECKLIST |
| 253 | +```yaml |
| 254 | +verify_integration: |
| 255 | + - status_file_created: "ls -la ~/research/mosquito-alert-model-monitor/data/status/PROJECT_JOB_NAME.json" |
| 256 | + - test_script_manually: "./scripts/test_dashboard_integration.sh" |
| 257 | + - check_dashboard_locally: "open ~/research/mosquito-alert-model-monitor/docs/index.html" |
| 258 | + - verify_no_job_failures: "Run original job and ensure it completes even if dashboard fails" |
| 259 | +``` |
| 260 | +
|
| 261 | +## COMMON_PROJECT_TYPES |
| 262 | +
|
| 263 | +### DAILY_DATA_PROCESSING |
| 264 | +```yaml |
| 265 | +typical_pattern: |
| 266 | + job_name: "project_daily_process" |
| 267 | + schedule: "Daily via cron" |
| 268 | + stages: ["download", "process", "upload", "cleanup"] |
| 269 | + progress_points: [0, 25, 50, 75, 100] |
| 270 | +``` |
| 271 | +
|
| 272 | +### MODEL_TRAINING |
| 273 | +```yaml |
| 274 | +typical_pattern: |
| 275 | + job_name: "model_training" |
| 276 | + schedule: "Weekly/Monthly" |
| 277 | + stages: ["data_prep", "training", "validation", "deployment"] |
| 278 | + progress_points: [0, 20, 60, 90, 100] |
| 279 | +``` |
| 280 | +
|
| 281 | +### DATA_COLLECTION |
| 282 | +```yaml |
| 283 | +typical_pattern: |
| 284 | + job_name: "data_collection" |
| 285 | + schedule: "Hourly/Daily" |
| 286 | + stages: ["fetch", "validate", "store", "backup"] |
| 287 | + progress_points: [0, 30, 70, 100] |
| 288 | +``` |
| 289 | +
|
| 290 | +## IMPLEMENTATION_NOTES |
| 291 | +- All dashboard status scripts exit with code 0 so they never cause the calling job to fail |
| 292 | +- Dashboard updates are "best effort" - job success is priority |
| 293 | +- JSON status files use standardized format |
| 294 | +- Log collection is optional and safe |
| 295 | +- Git operations have timeouts and retries |
| 296 | +- SLURM jobs get proper resource allocation |
| 297 | +- No dependencies on external libraries |
0 commit comments