#!/usr/bin/env Rscript

# check_data_gaps.R
# Comprehensive gap detection and filling system for weather data
# Identifies missing data and creates targeted collection tasks

library(tidyverse)
library(lubridate)
library(data.table)

cat("=== WEATHER DATA GAP ANALYSIS ===\n")
cat("Analysis started at:", format(Sys.time()), "\n\n")

# Configuration ----
ANALYSIS_DATE    <- Sys.Date()            # Reference date for the whole analysis
HISTORICAL_START <- as.Date("2013-01-01") # Expected historical coverage start
FORECAST_DAYS    <- 7                     # Expected forecast horizon in days
# Build an inclusive, day-by-day sequence of calendar dates.
#
# start_date: first Date in the sequence.
# end_date:   last Date in the sequence (included).
# Returns a Date vector covering [start_date, end_date].
create_date_range <- function(start_date, end_date) {
  # seq() dispatches to seq.Date for Date inputs — same result as the
  # explicit seq.Date() call.
  seq(from = start_date, to = end_date, by = "day")
}
| 23 | + |
# === 1. STATION DAILY DATA GAPS ===
# Compare observed station-days against the full station x date grid expected
# since HISTORICAL_START, and report what is missing.
cat("1. ANALYZING STATION DAILY DATA GAPS\n")
cat("=====================================\n")

# Load all available station daily data files produced upstream.
station_files <- list.files("data/output",
                            pattern = "daily_station_aggregated_.*\\.csv",
                            full.names = TRUE)
station_daily_gaps <- data.table()

if (length(station_files) > 0) {
  # Combine all station daily files; fill = TRUE tolerates column drift
  # between files.
  all_station_data      <- rbindlist(lapply(station_files, fread), fill = TRUE)
  all_station_data$date <- as.Date(all_station_data$date)

  # Deduplicate station-days: overlapping export files would otherwise be
  # double-counted and could report coverage above 100%.
  observed_station_days <- unique(all_station_data[, .(idema, date)])

  # Unique stations and the date range they are expected to cover
  # (up to yesterday — today's dailies are not expected yet).
  unique_stations <- unique(observed_station_days$idema)
  expected_dates  <- create_date_range(HISTORICAL_START, ANALYSIS_DATE - 1)

  cat("Stations with data:", length(unique_stations), "\n")
  cat("Expected date range:", as.character(min(expected_dates)),
      "to", as.character(max(expected_dates)), "\n")

  # Complete station x date grid (data.table cross join).
  complete_grid <- CJ(idema = unique_stations, date = expected_dates)

  # Anti-join: expected station-days with no matching observation.
  station_daily_gaps <- complete_grid[!observed_station_days, on = .(idema, date)]

  cat("Total expected station-days:", nrow(complete_grid), "\n")
  cat("Available station-days:", nrow(observed_station_days), "\n")
  cat("Missing station-days:", nrow(station_daily_gaps), "\n")
  cat("Coverage:",
      round(100 * nrow(observed_station_days) / nrow(complete_grid), 1), "%\n\n")

  # Gap summary by station, most incomplete first.
  gap_by_station <- station_daily_gaps[, .N, by = idema][order(-N)]
  cat("Top 10 stations with most missing days:\n")
  print(head(gap_by_station, 10))

} else {
  cat("No station daily data files found.\n")
}

cat("\n")
| 69 | + |
# === 2. MUNICIPAL FORECAST DATA GAPS ===
# Check that every expected municipality has a forecast row for each date in
# the recent-past-plus-horizon window.
cat("2. ANALYZING MUNICIPAL FORECAST DATA GAPS\n")
cat("=========================================\n")

# Load municipal data files produced upstream.
municipal_files <- list.files("data/output",
                              pattern = "municipal_aggregated_.*\\.csv",
                              full.names = TRUE)
municipal_forecast_gaps <- data.table()

if (length(municipal_files) > 0) {
  all_municipal_data       <- rbindlist(lapply(municipal_files, fread), fill = TRUE)
  all_municipal_data$fecha <- as.Date(all_municipal_data$fecha)

  # Forecast window: a few recent days plus the configured forecast horizon.
  forecast_start <- ANALYSIS_DATE - 3
  forecast_end   <- ANALYSIS_DATE + FORECAST_DAYS
  forecast_dates <- create_date_range(forecast_start, forecast_end)

  # Authoritative municipality list from the input data.
  if (file.exists("data/input/municipalities.csv.gz")) {
    all_municipalities    <- fread("data/input/municipalities.csv.gz")
    unique_municipalities <- unique(as.character(all_municipalities$CUMUN))

    cat("Expected municipalities:", length(unique_municipalities), "\n")
    cat("Forecast period:", as.character(min(forecast_dates)),
        "to", as.character(max(forecast_dates)), "\n")

    # Complete municipality x date grid (data.table cross join).
    forecast_grid <- CJ(municipio_code = unique_municipalities,
                        fecha = forecast_dates)

    # Coerce municipio_code to character before joining: fread may parse
    # numeric-looking codes as integers, and data.table errors on joins with
    # mismatched key types. unique() guards against duplicate forecast rows
    # inflating the coverage percentage.
    available_forecasts <- unique(
      all_municipal_data[fecha %in% forecast_dates & source == "forecast",
                         .(municipio_code = as.character(municipio_code), fecha)]
    )

    # Anti-join: expected forecasts with no matching row.
    municipal_forecast_gaps <- forecast_grid[!available_forecasts,
                                             on = .(municipio_code, fecha)]

    cat("Total expected municipal forecasts:", nrow(forecast_grid), "\n")
    cat("Available forecasts:", nrow(available_forecasts), "\n")
    cat("Missing forecasts:", nrow(municipal_forecast_gaps), "\n")
    cat("Forecast coverage:",
        round(100 * nrow(available_forecasts) / nrow(forecast_grid), 1), "%\n\n")

    # Gap summary by municipality, most incomplete first.
    gap_by_municipality <- municipal_forecast_gaps[, .N, by = municipio_code][order(-N)]
    cat("Top 10 municipalities with most missing forecasts:\n")
    print(head(gap_by_municipality, 10))

  } else {
    cat("Municipality list not found.\n")
  }
} else {
  cat("No municipal data files found.\n")
}

cat("\n")
| 124 | + |
# === 3. HOURLY DATA CONTINUITY ===
# Sanity-check the rolling hourly file: observation counts per day over the
# last 30 days, flagging suspiciously low days.
cat("3. ANALYZING HOURLY DATA CONTINUITY\n")
cat("===================================\n")

if (file.exists("data/output/hourly_station_ongoing.csv.gz")) {
  hourly_data      <- fread("data/output/hourly_station_ongoing.csv.gz")
  hourly_data$fint <- as_datetime(hourly_data$fint)
  hourly_data$date <- as.Date(hourly_data$fint)

  # Restrict to the recent window (last 30 days).
  recent_start  <- ANALYSIS_DATE - 30
  recent_hourly <- hourly_data[date >= recent_start]

  if (nrow(recent_hourly) > 0) {
    # Observations per day, oldest first.
    daily_counts <- recent_hourly[, .N, by = date][order(date)]

    cat("Recent hourly data (last 30 days):\n")
    cat("Total observations:", nrow(recent_hourly), "\n")
    cat("Date range:", as.character(min(daily_counts$date)),
        "to", as.character(max(daily_counts$date)), "\n")
    cat("Average observations per day:", round(mean(daily_counts$N), 0), "\n")

    # Days in the bottom quartile of counts are flagged as potential gaps.
    low_count_threshold <- quantile(daily_counts$N, 0.25)
    low_count_days      <- daily_counts[N < low_count_threshold]

    if (nrow(low_count_days) > 0) {
      cat("Days with potentially low observation counts:\n")
      print(head(low_count_days, 10))
    }
  } else {
    # Guard: with zero recent rows, min()/max() on an empty Date vector
    # would warn and print Inf/NA garbage instead of a readable message.
    cat("No hourly observations found in the last 30 days.\n")
  }

} else {
  cat("No hourly data file found.\n")
}

cat("\n")
| 160 | + |
# === 4. GENERATE GAP-FILLING RECOMMENDATIONS ===
# Turn the gap tables from sections 1-2 into prioritized, actionable
# collection commands.
cat("4. GAP-FILLING RECOMMENDATIONS\n")
cat("==============================\n")

recommendations <- list()

# Station daily gaps: only recent history is realistically recoverable, so
# prioritize the last 90 days.
if (nrow(station_daily_gaps) > 0) {
  priority_station_gaps <- station_daily_gaps[date >= (ANALYSIS_DATE - 90)]

  if (nrow(priority_station_gaps) > 0) {
    recommendations$station_daily <- list(
      priority = "HIGH",
      action   = "Collect recent station daily data",
      gaps     = nrow(priority_station_gaps),
      command  = "Rscript code/get_station_daily_hybrid.R # Focus on recent dates"
    )
  }
}

# Municipal forecast gaps: missing forecasts for today onward are critical
# because they cannot be backfilled later.
if (nrow(municipal_forecast_gaps) > 0) {
  current_forecast_gaps <- municipal_forecast_gaps[fecha >= ANALYSIS_DATE]

  if (nrow(current_forecast_gaps) > 0) {
    recommendations$municipal_forecasts <- list(
      priority = "CRITICAL",
      action   = "Re-collect municipal forecasts",
      gaps     = nrow(current_forecast_gaps),
      command  = "Rscript code/get_forecast_data_hybrid.R"
    )
  }
}

# Print recommendations in priority-list form.
if (length(recommendations) > 0) {
  for (i in seq_along(recommendations)) {
    rec <- recommendations[[i]]
    cat("RECOMMENDATION", i, "- Priority:", rec$priority, "\n")
    cat("Action:", rec$action, "\n")
    cat("Gaps to fill:", rec$gaps, "\n")
    cat("Command:", rec$command, "\n\n")
  }
} else {
  cat("✅ No immediate gap-filling actions needed.\n\n")
}
| 209 | + |
# === 5. SAVE GAP ANALYSIS RESULTS ===
# Persist the detailed gap tables and a machine-readable JSON summary.
cat("5. SAVING GAP ANALYSIS RESULTS\n")
cat("==============================\n")

# Build the summary. exists() guards cover the case where an upstream section
# was skipped because its input files were missing; plain if/else replaces
# scalar ifelse() (which is slower and strips attributes).
gap_summary <- list(
  analysis_date = ANALYSIS_DATE,
  station_daily = list(
    total_gaps = if (exists("station_daily_gaps")) nrow(station_daily_gaps) else 0,
    coverage_percent = if (exists("complete_grid") && exists("all_station_data")) {
      round(100 * nrow(all_station_data) / nrow(complete_grid), 1)
    } else {
      0
    }
  ),
  municipal_forecasts = list(
    total_gaps = if (exists("municipal_forecast_gaps")) nrow(municipal_forecast_gaps) else 0,
    coverage_percent = if (exists("forecast_grid") && exists("available_forecasts")) {
      round(100 * nrow(available_forecasts) / nrow(forecast_grid), 1)
    } else {
      0
    }
  ),
  hourly_continuity = list(
    recent_observations = if (exists("recent_hourly")) nrow(recent_hourly) else 0,
    avg_daily_count = if (exists("daily_counts")) round(mean(daily_counts$N), 0) else 0
  )
)

# Save detailed gap tables if they exist. Build each path once with paste0()
# so the printed message matches the file actually written — cat() with the
# path in pieces would insert spaces around the date.
if (nrow(station_daily_gaps) > 0) {
  station_gap_path <- paste0("data/output/gaps_station_daily_", ANALYSIS_DATE, ".csv")
  fwrite(station_daily_gaps, station_gap_path)
  cat("Station daily gaps saved to:", station_gap_path, "\n")
}

if (nrow(municipal_forecast_gaps) > 0) {
  municipal_gap_path <- paste0("data/output/gaps_municipal_forecasts_", ANALYSIS_DATE, ".csv")
  fwrite(municipal_forecast_gaps, municipal_gap_path)
  cat("Municipal forecast gaps saved to:", municipal_gap_path, "\n")
}

# Save summary as JSON for easy parsing by downstream tooling.
summary_path <- paste0("data/output/gap_analysis_summary_", ANALYSIS_DATE, ".json")
jsonlite::write_json(gap_summary, summary_path, pretty = TRUE, auto_unbox = TRUE)
cat("Gap analysis summary saved to:", summary_path, "\n")

cat("\n=== GAP ANALYSIS COMPLETE ===\n")
cat("Analysis completed at:", format(Sys.time()), "\n")