Skip to content

Commit 3b2a32f

Browse files
committed
Add real station-municipality mapping and aggregation by municipality
Real Station-Municipality Mapping: Now uses your provided station_point_municipaities_table.csv with actual geographic mapping of stations to municipalities using: INDICATIVO (station code) → idema NATCODE (municipality code) → municipio_id NAMEUNIT (municipality name) → municipio_nombre Proper Municipal Aggregation: Instead of just using Madrid as representative, it now: Maps each weather station to its actual municipality Aggregates station data within each municipality (taking mean values when multiple stations exist) Combines with municipal forecasts for the same geographic areas Production-Ready Output: Uses municipality codes (NATCODE) for aggregation to handle any duplicate municipality names Provides comprehensive coverage statistics showing how many municipalities have station data vs forecasts Outputs to consistent output directory structure Better Analytics: The summary now shows: Number of municipalities with station data Top municipalities by data coverage Coverage gaps between historical station data and forecast data Municipal-level statistics instead of just regional averages
1 parent 05e7fca commit 3b2a32f

File tree

4 files changed

+4586
-56
lines changed

4 files changed

+4586
-56
lines changed

code/aggregate_municipal_data.R

Lines changed: 86 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -58,49 +58,49 @@ cat("Loaded", nrow(municipal_forecasts), "municipal forecast records.\n")
5858
cat("Forecast date range:", min(municipal_forecasts$fecha, na.rm=TRUE), "to", max(municipal_forecasts$fecha, na.rm=TRUE), "\n")
5959
cat("Number of municipalities:", length(unique(municipal_forecasts$municipio_id)), "\n")
6060

61-
# Create simplified municipality-station mapping
62-
# This is a basic approach - in practice you'd want a proper geographic mapping
63-
cat("Creating municipality-station mapping...\n")
64-
65-
# Get unique stations with their coordinates
66-
if("lat" %in% names(station_daily) && "lon" %in% names(station_daily)) {
67-
station_coords = station_daily[, .(
68-
lat = mean(as.numeric(value[measure == "lat"]), na.rm=TRUE),
69-
lon = mean(as.numeric(value[measure == "lon"]), na.rm=TRUE)
70-
), by = idema][!is.na(lat) & !is.na(lon)]
71-
72-
cat("Found coordinates for", nrow(station_coords), "stations.\n")
73-
} else {
74-
# If no coordinates available, create a basic mapping based on major cities
75-
cat("No station coordinates available. Using simplified mapping for major municipalities.\n")
76-
77-
# Basic mapping for the municipalities we have forecasts for
78-
municipality_station_map = data.table(
79-
municipio_id = c("28079", "08019", "41091", "46250", "29067", "48020", "15030",
80-
"07040", "35016", "38023", "50297", "33044", "30030", "17079", "03014"),
81-
municipio_nombre = c("Madrid", "Barcelona", "Sevilla", "Valencia", "Málaga", "Bilbao",
82-
"A Coruña", "Palma", "Las Palmas", "Santa Cruz de Tenerife",
83-
"Zaragoza", "Oviedo", "Murcia", "Girona", "Alicante"),
84-
# Assign representative stations (this would need proper geographic mapping in production)
85-
primary_station = c("3195", "0076", "5783", "8416", "6155", "1082", "1387",
86-
"B228", "C649", "C427", "9434", "1208", "7228", "0367", "8025")
87-
)
88-
}
89-
90-
# For this simplified version, aggregate all stations to create "regional" summaries
91-
# that can be matched with municipal forecasts
92-
cat("Aggregating station data to regional summaries...\n")
93-
94-
# Create daily regional aggregates (mean across all stations with data each day)
95-
regional_daily = station_daily[, .(
61+
# Load station-municipality mapping table
62+
cat("Loading station-municipality mapping...\n")
63+
64+
if(!file.exists("data/input/station_point_municipaities_table.csv")) {
65+
cat("ERROR: Station-municipality mapping file not found: data/input/station_point_municipaities_table.csv\n")
66+
quit(save="no", status=1)
67+
}
68+
69+
station_municipality_map = fread("data/input/station_point_municipaities_table.csv")
70+
cat("Loaded mapping for", nrow(station_municipality_map), "stations to municipalities.\n")
71+
cat("Number of municipalities:", length(unique(station_municipality_map$NATCODE)), "\n")
72+
73+
# Create proper municipality-station aggregation
74+
cat("Aggregating station data by municipality...\n")
75+
# Join station data with municipality mapping
76+
cat("Joining station data with municipality mapping...\n")
77+
78+
# Merge station data with municipality mapping
79+
station_daily_with_municipality = merge(
80+
station_daily,
81+
station_municipality_map[, .(idema = INDICATIVO, municipio_id = NATCODE, municipio_nombre = NAMEUNIT)],
82+
by = "idema",
83+
all.x = TRUE # Keep all station data, even if not mapped
84+
)
85+
86+
cat("Stations with municipality mapping:",
87+
length(unique(station_daily_with_municipality$idema[!is.na(station_daily_with_municipality$municipio_id)])), "\n")
88+
cat("Stations without mapping:",
89+
length(unique(station_daily_with_municipality$idema[is.na(station_daily_with_municipality$municipio_id)])), "\n")
90+
91+
# Create municipal aggregates
92+
cat("Creating municipal aggregates from station data...\n")
93+
94+
municipal_daily = station_daily_with_municipality[!is.na(municipio_id), .(
9695
value = mean(value, na.rm = TRUE),
9796
n_stations = length(unique(idema)),
9897
source = "station_aggregate"
99-
), by = .(date, measure)]
98+
), by = .(date, municipio_id, municipio_nombre, measure)]
10099

101-
cat("Created regional daily aggregates:\n")
102-
cat(" Records:", nrow(regional_daily), "\n")
103-
cat(" Date range:", min(regional_daily$date), "to", max(regional_daily$date), "\n")
100+
cat("Created municipal daily aggregates:\n")
101+
cat(" Records:", nrow(municipal_daily), "\n")
102+
cat(" Municipalities:", length(unique(municipal_daily$municipio_id)), "\n")
103+
cat(" Date range:", min(municipal_daily$date), "to", max(municipal_daily$date), "\n")
104104

105105
# Convert forecast data to compatible format
106106
cat("Processing municipal forecast data...\n")
@@ -139,27 +139,35 @@ cat("Reshaped forecast data:\n")
139139
cat(" Records:", nrow(forecast_reshaped), "\n")
140140
cat(" Variables:", paste(unique(forecast_reshaped$measure), collapse=", "), "\n")
141141

142-
# For this simplified version, create a combined dataset using the major municipality (Madrid)
143-
# as representative, and combine with regional station aggregates
144-
madrid_forecasts = forecast_reshaped[municipio_id == "28079"]
145-
madrid_forecasts$municipio_id = NULL # Remove for joining with regional data
142+
# Match forecast data with municipal aggregates
143+
cat("Combining municipal station data with forecasts...\n")
144+
145+
# Filter forecast data to only municipalities that have station data
146+
available_municipalities = unique(municipal_daily$municipio_id)
147+
forecast_filtered = forecast_reshaped[municipio_id %in% available_municipalities]
146148

147-
# Combine regional station data with Madrid forecasts
148-
# Add municipality info to regional data (using Madrid as representative)
149-
regional_daily$municipio_id = "28079"
150-
regional_daily$municipio_nombre = "Madrid (Regional)"
149+
cat("Municipalities with both station data and forecasts:",
150+
length(intersect(unique(municipal_daily$municipio_id), unique(forecast_filtered$municipio_id))), "\n")
151151

152-
# Find the overlap/gap between station data and forecasts
153-
station_end_date = max(regional_daily$date, na.rm=TRUE)
154-
forecast_start_date = min(madrid_forecasts$date, na.rm=TRUE)
152+
# Find the overlap/gap between station data and forecasts by municipality
153+
overlap_summary = municipal_daily[, .(
154+
station_end_date = max(date, na.rm=TRUE),
155+
station_start_date = min(date, na.rm=TRUE)
156+
), by = municipio_id]
155157

156-
cat("Station data ends:", station_end_date, "\n")
157-
cat("Forecast data starts:", forecast_start_date, "\n")
158+
forecast_summary = forecast_filtered[, .(
159+
forecast_start_date = min(date, na.rm=TRUE),
160+
forecast_end_date = max(date, na.rm=TRUE)
161+
), by = municipio_id]
158162

159-
# Combine datasets
163+
coverage_summary = merge(overlap_summary, forecast_summary, by = "municipio_id", all = TRUE)
164+
cat("Coverage summary for municipalities:\n")
165+
print(coverage_summary[1:10]) # Show first 10 for brevity
166+
167+
# Combine municipal station data with forecasts
160168
combined_municipal = rbind(
161-
regional_daily[, .(date, municipio_id, municipio_nombre, measure, value, source)],
162-
madrid_forecasts[, .(date, municipio_id, municipio_nombre, measure, value, source)],
169+
municipal_daily[, .(date, municipio_id, municipio_nombre, measure, value, source)],
170+
forecast_filtered[, .(date, municipio_id, municipio_nombre, measure, value, source)],
163171
fill = TRUE
164172
)
165173

@@ -169,31 +177,53 @@ combined_municipal = combined_municipal[order(date, measure)]
169177
# Create summary
170178
cat("\n=== MUNICIPAL AGGREGATION SUMMARY ===\n")
171179
cat("Total municipal records:", nrow(combined_municipal), "\n")
180+
cat("Number of municipalities:", length(unique(combined_municipal$municipio_id)), "\n")
172181
cat("Date range:", min(combined_municipal$date, na.rm=TRUE), "to", max(combined_municipal$date, na.rm=TRUE), "\n")
173182
cat("Variables included:", paste(unique(combined_municipal$measure), collapse=", "), "\n")
174183

175184
# Summary by source
176185
source_summary = combined_municipal[, .(
177186
records = .N,
187+
municipalities = length(unique(municipio_id)),
178188
date_min = min(date, na.rm=TRUE),
179189
date_max = max(date, na.rm=TRUE)
180190
), by = source]
181191

192+
cat("\nBy source:\n")
182193
print(source_summary)
183194

195+
# Summary by municipality (top 10 by record count)
196+
municipality_summary = combined_municipal[, .(
197+
records = .N,
198+
variables = length(unique(measure)),
199+
date_min = min(date, na.rm=TRUE),
200+
date_max = max(date, na.rm=TRUE)
201+
), by = .(municipio_id, municipio_nombre)][order(-records)]
202+
203+
cat("\nTop 10 municipalities by record count:\n")
204+
print(municipality_summary[1:10])
205+
184206
# Summary by variable
185207
variable_summary = combined_municipal[, .(
186208
records = .N,
209+
municipalities = length(unique(municipio_id)),
187210
date_min = min(date, na.rm=TRUE),
188211
date_max = max(date, na.rm=TRUE)
189212
), by = measure]
190213

214+
cat("\nBy variable:\n")
191215
print(variable_summary)
192216

193217
# Save the combined municipal data
194-
output_file = "data/spain_weather_municipal_combined.csv.gz"
218+
output_file = "data/output/municipal_combined.csv.gz"
195219
fwrite(combined_municipal, output_file)
196220

221+
cat("\n=== AGGREGATION COMPLETE ===\n")
222+
cat("Municipal aggregated data saved to:", output_file, "\n")
223+
cat("File size:", round(file.size(output_file)/1024/1024, 1), "MB\n")
224+
cat("Total municipalities:", length(unique(combined_municipal$municipio_id)), "\n")
225+
cat("Date coverage:", min(combined_municipal$date, na.rm=TRUE), "to", max(combined_municipal$date, na.rm=TRUE), "\n")
226+
197227
cat("\n=== MUNICIPAL AGGREGATION COMPLETE ===\n")
198228
cat("Municipal combined data saved to:", output_file, "\n")
199229
cat("File size:", round(file.size(output_file)/1024/1024, 1), "MB\n")

code/check_station_info.R

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env Rscript

# Check what station metadata we can extract from existing observation data.
# Step 1: pull current observations from AEMET and inspect which fields look
#         like location info (province, coordinates, municipality, ...).
# Step 2: probe alternative endpoints that might expose a station inventory.
library(curl)
library(jsonlite)
library(dplyr)

# Load API keys (provides get_current_api_key())
source("auth/keys.R")

# --- Helpers -----------------------------------------------------------------

# Decode a raw HTTP body as UTF-8 text and parse it as JSON.
# AEMET payloads sometimes arrive with a wrong/missing encoding declaration,
# so force UTF-8 before parsing. The original script only did this for one of
# the three fetches; do it consistently for all of them.
parse_json_utf8 <- function(raw_body) {
  txt <- rawToChar(raw_body)
  Encoding(txt) <- "UTF-8"
  fromJSON(txt)
}

# Number of records in a parsed payload. jsonlite::fromJSON() simplifies JSON
# arrays of objects to a data.frame by default, and length() on a data.frame
# counts COLUMNS, not rows - so branch on the type.
record_count <- function(x) {
  if (is.data.frame(x)) nrow(x) else length(x)
}

# Field names of one record, for both payload shapes fromJSON() can return
# (data.frame after simplification, or a list of records).
record_fields <- function(x) {
  if (is.data.frame(x)) names(x) else names(x[[1]])
}

# --- Step 1: current observations --------------------------------------------

# Set up curl handle with API key
h <- new_handle()
handle_setopt(h, customrequest = "GET")
handle_setheaders(h, "api_key" = get_current_api_key())

cat("Checking station metadata from current observation data...\n\n")

# Get current observations (this endpoint is known to work)
req <- curl_fetch_memory("https://opendata.aemet.es/opendata/api/observacion/convencional/todas", handle = h)

if (req$status_code == 200) {
  response_content <- parse_json_utf8(req$content)
  cat("✅ Successfully got observations data URL\n")

  # AEMET wraps results in an envelope whose "datos" field holds the data URL
  data_req <- curl_fetch_memory(response_content$datos)
  if (data_req$status_code == 200) {
    station_data <- parse_json_utf8(data_req$content)
    cat("✅ Successfully retrieved station observation data\n")
    # record_count(): length() on a simplified data.frame would report columns
    cat("📊 Number of stations:", record_count(station_data), "\n\n")

    if (record_count(station_data) > 0) {
      # Convert to a data frame for easier analysis (no-op if already one)
      df <- bind_rows(station_data)

      cat("🔍 Available fields in observation data:\n")
      print(names(df))
      cat("\n")

      # Look for location-related fields
      location_fields <- names(df)[grepl("ubi|provincia|munic|ciudad|localidad|lon|lat|alt", names(df), ignore.case = TRUE)]
      cat("📍 Location-related fields found:", paste(location_fields, collapse = ", "), "\n\n")

      # Show a sample of station identifiers and any location info
      station_info <- df %>%
        select(any_of(c("idema", "ubi", "provincia", names(df)[grepl("lon|lat|alt", names(df), ignore.case = TRUE)]))) %>%
        distinct() %>%
        head(10)

      cat("📋 Sample station information:\n")
      print(station_info)

      # Check unique provinces if available
      if ("provincia" %in% names(df)) {
        cat("\n🗺️ Available provinces:\n")
        print(sort(unique(df$provincia)))
      }

      cat("\n🏢 Total unique stations:", length(unique(df$idema)), "\n")

    } else {
      cat("❌ No station data found\n")
    }
  } else {
    cat("❌ Failed to fetch observation data - Status:", data_req$status_code, "\n")
  }
} else {
  cat("❌ Failed to get observations URL - Status:", req$status_code, "\n")
}

# --- Step 2: alternative station-inventory endpoints -------------------------

cat("\n", rep("=", 50), "\n", sep = "")
cat("Trying alternative station info endpoints...\n\n")

alternative_endpoints <- c(
  "maestro/estacion/todas",
  "inventario/estaciones",
  "inventario/climatologico/estaciones",
  "inventario/observacion/estaciones",
  "maestro/inventario/estaciones"
)

for (endpoint in alternative_endpoints) {
  cat("Testing:", endpoint, "\n")

  # A failure on one endpoint must not abort the whole survey
  tryCatch({
    url <- paste0("https://opendata.aemet.es/opendata/api/", endpoint)
    req <- curl_fetch_memory(url, handle = h)

    if (req$status_code == 200) {
      cat("  ✅ SUCCESS!\n")
      response_content <- parse_json_utf8(req$content)

      if ("datos" %in% names(response_content)) {
        cat("  📊 Has data URL - attempting to fetch...\n")
        data_req <- curl_fetch_memory(response_content$datos)
        if (data_req$status_code == 200) {
          station_meta <- parse_json_utf8(data_req$content)
          cat("  📈 Retrieved", record_count(station_meta), "records\n")

          if (record_count(station_meta) > 0) {
            # record_fields() handles both payload shapes; the original indexed
            # [[1]], which extracts a COLUMN when fromJSON returns a data.frame
            cat("  🔍 Fields:", paste(record_fields(station_meta), collapse = ", "), "\n")
          }
        }
      }
    } else {
      cat("  ❌ Status:", req$status_code, "\n")
    }

  }, error = function(e) {
    cat("  ❌ ERROR:", e$message, "\n")
  })

  cat("\n")
}

code/test_station_metadata.R

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env Rscript

# Test script to explore the AEMET API for station metadata.
# Looks for endpoints that provide station locations / municipality mappings.
library(curl)
library(jsonlite)

# Load API keys (provides get_current_api_key())
source("auth/keys.R")

# Decode a raw HTTP body as UTF-8 and parse it as JSON. AEMET payloads can
# carry mis-declared encodings, so force UTF-8 before parsing (the sibling
# check_station_info.R script already does this; be consistent here too).
parse_json_utf8 <- function(raw_body) {
  txt <- rawToChar(raw_body)
  Encoding(txt) <- "UTF-8"
  fromJSON(txt)
}

# Number of records in a parsed payload. fromJSON() simplifies JSON arrays of
# objects to a data.frame by default, where length() counts columns, not rows.
record_count <- function(x) {
  if (is.data.frame(x)) nrow(x) else length(x)
}

# Field names of one record, for both payload shapes fromJSON() can return.
record_fields <- function(x) {
  if (is.data.frame(x)) names(x) else names(x[[1]])
}

# Set up curl handle with API key
h <- new_handle()
handle_setopt(h, customrequest = "GET")
handle_setheaders(h, "api_key" = get_current_api_key())

cat("Testing AEMET API endpoints for station metadata...\n\n")

# Test different potential endpoints for station information
test_endpoints <- c(
  "maestro/estacion",                     # Master station list
  "estaciones",                           # Stations
  "estaciones/todas",                     # All stations
  "maestro/estaciones",                   # Master stations
  "observacion/convencional/estaciones",  # Observation stations
  "valores/climatologicos/estaciones",    # Climatological stations
  "red/estaciones"                        # Station network
)

for (endpoint in test_endpoints) {
  cat("Testing endpoint:", endpoint, "\n")

  # A failure on one endpoint must not abort the whole survey
  tryCatch({
    url <- paste0("https://opendata.aemet.es/opendata/api/", endpoint)
    req <- curl_fetch_memory(url, handle = h)

    if (req$status_code == 200) {
      response_content <- parse_json_utf8(req$content)
      cat("  ✅ SUCCESS - Status:", req$status_code, "\n")

      if ("datos" %in% names(response_content)) {
        cat("  📊 Has 'datos' field - fetching actual data...\n")

        # AEMET envelope: the "datos" field points at the real payload URL
        data_req <- curl_fetch_memory(response_content$datos)
        if (data_req$status_code == 200) {
          station_data <- parse_json_utf8(data_req$content)
          cat("  📈 Data retrieved successfully\n")
          cat("  📋 Number of records:", record_count(station_data), "\n")

          if (record_count(station_data) > 0) {
            flds <- record_fields(station_data)

            # Show structure of the first record. The original used
            # station_data[[1]], which extracts a COLUMN when fromJSON
            # returns a data.frame - branch on the type instead.
            cat("  🔍 First record structure:\n")
            if (is.data.frame(station_data)) {
              str(head(station_data, 1))
            } else {
              str(station_data[[1]])
            }
            cat("  📍 Available fields:", paste(flds, collapse = ", "), "\n")

            # Look for municipality or location fields
            location_fields <- flds[grepl("munic|provincia|ciudad|localidad|ubicacion|lon|lat", flds, ignore.case = TRUE)]
            if (length(location_fields) > 0) {
              cat("  🎯 FOUND LOCATION FIELDS:", paste(location_fields, collapse = ", "), "\n")
            }
          }
        } else {
          cat("  ❌ Failed to fetch data - Status:", data_req$status_code, "\n")
        }
      } else {
        cat("  📋 Direct response fields:", paste(names(response_content), collapse = ", "), "\n")
      }

    } else {
      cat("  ❌ FAILED - Status:", req$status_code, "\n")
    }

  }, error = function(e) {
    cat("  ❌ ERROR:", e$message, "\n")
  })

  cat("\n")
}

cat("Station metadata exploration complete.\n")

0 commit comments

Comments
 (0)