Skip to content

Commit c6de5c9

Browse files
committed
Add hybrid weather data collection gap filling and monitoring scripts and crontab configuration
- Created CRONTAB_LINES_TO_ADD.txt for scheduling daily, weekly, monthly, and quarterly data collection tasks.
- Implemented check_data_gaps.R for comprehensive gap detection and filling in weather data.
- Added fill_data_gaps.R to intelligently fill data gaps without redundant historical collection.
- Updated collect_all_datasets_hybrid.R to include post-collection gap analysis and data summary updates.
- Introduced generate_data_summary.R for generating a comprehensive data summary for README.md.
- Added update_readme_with_summary.R to automate README updates with the latest data summary.
- Updated crontab_lines_to_add.txt to reflect new scheduling for hybrid data collection.
- Enhanced logging for weather_hybrid job outputs and errors.
1 parent c5f229b commit c6de5c9

9 files changed

+971
-3
lines changed

CRONTAB_LINES_TO_ADD.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# WEATHER DATA COLLECTION - HYBRID SYSTEM
2+
# Add these lines to your existing crontab with: crontab -e
3+
4+
# Main hybrid collection (daily at 2 AM) - All three datasets
5+
0 2 * * * cd /home/j.palmer/research/weather-data-collector-spain && sbatch update_weather_hybrid.sh
6+
7+
# Daily README update (6 AM) - Updates README with current status after collection
8+
0 6 * * * cd /home/j.palmer/research/weather-data-collector-spain && Rscript code/update_readme_with_summary.R
9+
10+
# Weekly gap filling (Sundays at 1 AM) - Fills any missing data without redundancy
11+
0 1 * * 0 cd /home/j.palmer/research/weather-data-collector-spain && Rscript code/fill_data_gaps.R
12+
13+
# INSTRUCTIONS:
14+
# 1. Copy the three lines above (without the # comments)
15+
# 2. Run: crontab -e
16+
# 3. Paste the lines at the end of your existing crontab
17+
# 4. Save and exit
18+
#
19+
# This will:
20+
# - Collect all data daily (2 AM)
21+
# - Update README daily (6 AM) - shows near real-time data status
22+
# - Fill gaps weekly without redundant historical collection (1 AM Sundays)
23+
#
24+
# Expected performance:
25+
# - Daily collection: 2-4 hours (vs 33+ hours previously)
26+
# - Daily README update: 1-2 minutes (very lightweight)
27+
# - Gap filling: 30 minutes - 2 hours depending on gaps
28+
#
29+
# Timeline:
30+
# - 1 AM Sunday: Gap filling (if needed)
31+
# - 2 AM daily: Main data collection
32+
# - 6 AM daily: README status update (after collection completes)

code/check_data_gaps.R

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
#!/usr/bin/env Rscript
2+
3+
# check_data_gaps.R
4+
# Comprehensive gap detection and filling system for weather data
5+
# Identifies missing data and creates targeted collection tasks
6+
7+
library(tidyverse)
8+
library(lubridate)
9+
library(data.table)
10+
11+
cat("=== WEATHER DATA GAP ANALYSIS ===\n")
12+
cat("Analysis started at:", format(Sys.time()), "\n\n")
13+
14+
# Configuration
15+
ANALYSIS_DATE = Sys.Date()
16+
HISTORICAL_START = as.Date("2013-01-01") # Expected historical coverage start
17+
FORECAST_DAYS = 7 # Expected forecast coverage
18+
19+
# Helper function to create date ranges
20+
create_date_range <- function(start_date, end_date) {
21+
seq.Date(from = start_date, to = end_date, by = "day")
22+
}
23+
24+
# === 1. STATION DAILY DATA GAPS ===
25+
cat("1. ANALYZING STATION DAILY DATA GAPS\n")
26+
cat("=====================================\n")
27+
28+
# Load all available station daily data
29+
station_files = list.files("data/output", pattern = "daily_station_aggregated_.*\\.csv", full.names = TRUE)
30+
station_daily_gaps = data.table()
31+
32+
if(length(station_files) > 0) {
33+
# Combine all station daily files
34+
all_station_data = rbindlist(lapply(station_files, fread), fill = TRUE)
35+
all_station_data$date = as.Date(all_station_data$date)
36+
37+
# Get unique stations and expected date range
38+
unique_stations = unique(all_station_data$idema)
39+
expected_dates = create_date_range(HISTORICAL_START, ANALYSIS_DATE - 1)
40+
41+
cat("Stations with data:", length(unique_stations), "\n")
42+
cat("Expected date range:", as.character(min(expected_dates)), "to", as.character(max(expected_dates)), "\n")
43+
44+
# Create complete grid and find gaps
45+
complete_grid = expand.grid(
46+
idema = unique_stations,
47+
date = expected_dates,
48+
stringsAsFactors = FALSE
49+
) %>% as.data.table()
50+
51+
# Identify gaps
52+
station_daily_gaps = complete_grid[!all_station_data[, .(idema, date)], on = .(idema, date)]
53+
54+
cat("Total expected station-days:", nrow(complete_grid), "\n")
55+
cat("Available station-days:", nrow(all_station_data), "\n")
56+
cat("Missing station-days:", nrow(station_daily_gaps), "\n")
57+
cat("Coverage:", round(100 * nrow(all_station_data) / nrow(complete_grid), 1), "%\n\n")
58+
59+
# Gap summary by station
60+
gap_by_station = station_daily_gaps[, .N, by = idema][order(-N)]
61+
cat("Top 10 stations with most missing days:\n")
62+
print(head(gap_by_station, 10))
63+
64+
} else {
65+
cat("No station daily data files found.\n")
66+
}
67+
68+
cat("\n")
69+
70+
# === 2. MUNICIPAL FORECAST DATA GAPS ===
71+
cat("2. ANALYZING MUNICIPAL FORECAST DATA GAPS\n")
72+
cat("=========================================\n")
73+
74+
# Load municipal data
75+
municipal_files = list.files("data/output", pattern = "municipal_aggregated_.*\\.csv", full.names = TRUE)
76+
municipal_forecast_gaps = data.table()
77+
78+
if(length(municipal_files) > 0) {
79+
all_municipal_data = rbindlist(lapply(municipal_files, fread), fill = TRUE)
80+
all_municipal_data$fecha = as.Date(all_municipal_data$fecha)
81+
82+
# Focus on forecast period (recent + future)
83+
forecast_start = ANALYSIS_DATE - 3 # Recent days
84+
forecast_end = ANALYSIS_DATE + FORECAST_DAYS
85+
forecast_dates = create_date_range(forecast_start, forecast_end)
86+
87+
# Expected municipalities (load from input)
88+
if(file.exists("data/input/municipalities.csv.gz")) {
89+
all_municipalities = fread("data/input/municipalities.csv.gz")
90+
unique_municipalities = unique(as.character(all_municipalities$CUMUN))
91+
92+
cat("Expected municipalities:", length(unique_municipalities), "\n")
93+
cat("Forecast period:", as.character(min(forecast_dates)), "to", as.character(max(forecast_dates)), "\n")
94+
95+
# Create complete forecast grid
96+
forecast_grid = expand.grid(
97+
municipio_code = unique_municipalities,
98+
fecha = forecast_dates,
99+
stringsAsFactors = FALSE
100+
) %>% as.data.table()
101+
102+
# Identify forecast gaps
103+
available_forecasts = all_municipal_data[fecha %in% forecast_dates & source == "forecast", .(municipio_code, fecha)]
104+
municipal_forecast_gaps = forecast_grid[!available_forecasts, on = .(municipio_code, fecha)]
105+
106+
cat("Total expected municipal forecasts:", nrow(forecast_grid), "\n")
107+
cat("Available forecasts:", nrow(available_forecasts), "\n")
108+
cat("Missing forecasts:", nrow(municipal_forecast_gaps), "\n")
109+
cat("Forecast coverage:", round(100 * nrow(available_forecasts) / nrow(forecast_grid), 1), "%\n\n")
110+
111+
# Gap summary by municipality
112+
gap_by_municipality = municipal_forecast_gaps[, .N, by = municipio_code][order(-N)]
113+
cat("Top 10 municipalities with most missing forecasts:\n")
114+
print(head(gap_by_municipality, 10))
115+
116+
} else {
117+
cat("Municipality list not found.\n")
118+
}
119+
} else {
120+
cat("No municipal data files found.\n")
121+
}
122+
123+
cat("\n")
124+
125+
# === 3. HOURLY DATA CONTINUITY ===
126+
cat("3. ANALYZING HOURLY DATA CONTINUITY\n")
127+
cat("===================================\n")
128+
129+
if(file.exists("data/output/hourly_station_ongoing.csv.gz")) {
130+
hourly_data = fread("data/output/hourly_station_ongoing.csv.gz")
131+
hourly_data$fint = as_datetime(hourly_data$fint)
132+
hourly_data$date = as.Date(hourly_data$fint)
133+
134+
# Check recent continuity (last 30 days)
135+
recent_start = ANALYSIS_DATE - 30
136+
recent_hourly = hourly_data[date >= recent_start]
137+
138+
# Count observations per day
139+
daily_counts = recent_hourly[, .N, by = date][order(date)]
140+
141+
cat("Recent hourly data (last 30 days):\n")
142+
cat("Total observations:", nrow(recent_hourly), "\n")
143+
cat("Date range:", as.character(min(daily_counts$date)), "to", as.character(max(daily_counts$date)), "\n")
144+
cat("Average observations per day:", round(mean(daily_counts$N), 0), "\n")
145+
146+
# Identify days with low counts (potential gaps)
147+
low_count_threshold = quantile(daily_counts$N, 0.25) # Bottom quartile
148+
low_count_days = daily_counts[N < low_count_threshold]
149+
150+
if(nrow(low_count_days) > 0) {
151+
cat("Days with potentially low observation counts:\n")
152+
print(head(low_count_days, 10))
153+
}
154+
155+
} else {
156+
cat("No hourly data file found.\n")
157+
}
158+
159+
cat("\n")
160+
161+
# === 4. GENERATE GAP-FILLING RECOMMENDATIONS ===
162+
cat("4. GAP-FILLING RECOMMENDATIONS\n")
163+
cat("==============================\n")
164+
165+
recommendations = list()
166+
167+
# Station daily gaps
168+
if(nrow(station_daily_gaps) > 0) {
169+
# Prioritize recent gaps and high-value stations
170+
priority_station_gaps = station_daily_gaps[date >= (ANALYSIS_DATE - 90)] # Last 90 days
171+
172+
if(nrow(priority_station_gaps) > 0) {
173+
recommendations$station_daily = list(
174+
priority = "HIGH",
175+
action = "Collect recent station daily data",
176+
gaps = nrow(priority_station_gaps),
177+
command = "Rscript code/get_station_daily_hybrid.R # Focus on recent dates"
178+
)
179+
}
180+
}
181+
182+
# Municipal forecast gaps
183+
if(nrow(municipal_forecast_gaps) > 0) {
184+
# Current and future forecasts are high priority
185+
current_forecast_gaps = municipal_forecast_gaps[fecha >= ANALYSIS_DATE]
186+
187+
if(nrow(current_forecast_gaps) > 0) {
188+
recommendations$municipal_forecasts = list(
189+
priority = "CRITICAL",
190+
action = "Re-collect municipal forecasts",
191+
gaps = nrow(current_forecast_gaps),
192+
command = "Rscript code/get_forecast_data_hybrid.R"
193+
)
194+
}
195+
}
196+
197+
# Print recommendations
198+
if(length(recommendations) > 0) {
199+
for(i in seq_along(recommendations)) {
200+
rec = recommendations[[i]]
201+
cat("RECOMMENDATION", i, "- Priority:", rec$priority, "\n")
202+
cat("Action:", rec$action, "\n")
203+
cat("Gaps to fill:", rec$gaps, "\n")
204+
cat("Command:", rec$command, "\n\n")
205+
}
206+
} else {
207+
cat("✅ No immediate gap-filling actions needed.\n\n")
208+
}
209+
210+
# === 5. SAVE GAP ANALYSIS RESULTS ===
211+
cat("5. SAVING GAP ANALYSIS RESULTS\n")
212+
cat("==============================\n")
213+
214+
# Create gap analysis summary
215+
gap_summary = list(
216+
analysis_date = ANALYSIS_DATE,
217+
station_daily = list(
218+
total_gaps = ifelse(exists("station_daily_gaps"), nrow(station_daily_gaps), 0),
219+
coverage_percent = ifelse(exists("complete_grid") && exists("all_station_data"),
220+
round(100 * nrow(all_station_data) / nrow(complete_grid), 1), 0)
221+
),
222+
municipal_forecasts = list(
223+
total_gaps = ifelse(exists("municipal_forecast_gaps"), nrow(municipal_forecast_gaps), 0),
224+
coverage_percent = ifelse(exists("forecast_grid") && exists("available_forecasts"),
225+
round(100 * nrow(available_forecasts) / nrow(forecast_grid), 1), 0)
226+
),
227+
hourly_continuity = list(
228+
recent_observations = ifelse(exists("recent_hourly"), nrow(recent_hourly), 0),
229+
avg_daily_count = ifelse(exists("daily_counts"), round(mean(daily_counts$N), 0), 0)
230+
)
231+
)
232+
233+
# Save detailed gaps if they exist
234+
if(nrow(station_daily_gaps) > 0) {
235+
fwrite(station_daily_gaps, paste0("data/output/gaps_station_daily_", ANALYSIS_DATE, ".csv"))
236+
cat("Station daily gaps saved to: data/output/gaps_station_daily_", ANALYSIS_DATE, ".csv\n")
237+
}
238+
239+
if(nrow(municipal_forecast_gaps) > 0) {
240+
fwrite(municipal_forecast_gaps, paste0("data/output/gaps_municipal_forecasts_", ANALYSIS_DATE, ".csv"))
241+
cat("Municipal forecast gaps saved to: data/output/gaps_municipal_forecasts_", ANALYSIS_DATE, ".csv\n")
242+
}
243+
244+
# Save summary as JSON for easy parsing
245+
jsonlite::write_json(gap_summary, paste0("data/output/gap_analysis_summary_", ANALYSIS_DATE, ".json"),
246+
pretty = TRUE, auto_unbox = TRUE)
247+
cat("Gap analysis summary saved to: data/output/gap_analysis_summary_", ANALYSIS_DATE, ".json\n")
248+
249+
cat("\n=== GAP ANALYSIS COMPLETE ===\n")
250+
cat("Analysis completed at:", format(Sys.time()), "\n")

code/collect_all_datasets_hybrid.R

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,46 @@ if(COLLECT_HOURLY_DATA) {
6767
}, error = function(e) {
6868
cat("❌ Dataset 3 failed:", e$message, "\n\n")
6969
dataset3_end = Sys.time()
70+
dataset3_end = Sys.time()
7071
times$hourly_data = as.numeric(difftime(dataset3_end, dataset3_start, units = "mins"))
71-
})
72-
}
7372
cat("✅ Dataset 3 completed in", round(times$hourly_data, 2), "minutes\n\n")
7473
}, error = function(e) {
7574
cat("❌ Dataset 3 failed:", e$message, "\n\n")
76-
times$hourly_data = NA
75+
dataset3_end = Sys.time()
76+
times$hourly_data = as.numeric(difftime(dataset3_end, dataset3_start, units = "mins"))
7777
})
7878
}
7979

80+
# === POST-COLLECTION GAP ANALYSIS AND MONITORING ===
81+
cat("=== POST-COLLECTION ANALYSIS ===\n")
82+
83+
# Run gap analysis
84+
cat("Running gap analysis...\n")
85+
tryCatch({
86+
source("code/check_data_gaps.R")
87+
cat("✅ Gap analysis completed.\n")
88+
}, error = function(e) {
89+
cat("❌ Gap analysis failed:", e$message, "\n")
90+
})
91+
92+
# Update data summary
93+
cat("Updating data summary...\n")
94+
tryCatch({
95+
source("code/generate_data_summary.R")
96+
cat("✅ Data summary updated.\n")
97+
}, error = function(e) {
98+
cat("❌ Data summary failed:", e$message, "\n")
99+
})
100+
101+
# Update README with current status
102+
cat("Updating README with latest data status...\n")
103+
tryCatch({
104+
source("code/update_readme_with_summary.R")
105+
cat("✅ README updated with current data status.\n")
106+
}, error = function(e) {
107+
cat("❌ README update failed:", e$message, "\n")
108+
})
109+
80110
# Final summary
81111
end_time = Sys.time()
82112
total_time = as.numeric(difftime(end_time, start_time, units = "mins"))

0 commit comments

Comments
 (0)