Skip to content

Commit 3b2a32f

Browse files
committed
Add real station-municipality mapping and aggregation by municipality
Real Station-Municipality Mapping: Now uses your provided station_point_municipaities_table.csv with actual geographic mapping of stations to municipalities using: INDICATIVO (station code) → idema NATCODE (municipality code) → municipio_id NAMEUNIT (municipality name) → municipio_nombre Proper Municipal Aggregation: Instead of just using Madrid as representative, it now: Maps each weather station to its actual municipality Aggregates station data within each municipality (taking mean values when multiple stations exist) Combines with municipal forecasts for the same geographic areas Production-Ready Output: Uses municipality codes (NATCODE) for aggregation to handle any duplicate municipality names Provides comprehensive coverage statistics showing how many municipalities have station data vs forecasts Outputs to consistent output directory structure Better Analytics: The summary now shows: Number of municipalities with station data Top municipalities by data coverage Coverage gaps between historical station data and forecast data Municipal-level statistics instead of just regional averages
1 parent 05e7fca commit 3b2a32f

File tree

4 files changed

+4586
-56
lines changed

4 files changed

+4586
-56
lines changed

code/aggregate_municipal_data.R

Lines changed: 86 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -58,49 +58,49 @@ cat("Loaded", nrow(municipal_forecasts), "municipal forecast records.\n")
5858
cat("Forecast date range:", min(municipal_forecasts$fecha, na.rm=TRUE), "to", max(municipal_forecasts$fecha, na.rm=TRUE), "\n")
5959
cat("Number of municipalities:", length(unique(municipal_forecasts$municipio_id)), "\n")
6060

61-
# Create simplified municipality-station mapping
62-
# This is a basic approach - in practice you'd want a proper geographic mapping
63-
cat("Creating municipality-station mapping...\n")
64-
65-
# Get unique stations with their coordinates
66-
if("lat" %in% names(station_daily) && "lon" %in% names(station_daily)) {
67-
station_coords = station_daily[, .(
68-
lat = mean(as.numeric(value[measure == "lat"]), na.rm=TRUE),
69-
lon = mean(as.numeric(value[measure == "lon"]), na.rm=TRUE)
70-
), by = idema][!is.na(lat) & !is.na(lon)]
71-
72-
cat("Found coordinates for", nrow(station_coords), "stations.\n")
73-
} else {
74-
# If no coordinates available, create a basic mapping based on major cities
75-
cat("No station coordinates available. Using simplified mapping for major municipalities.\n")
76-
77-
# Basic mapping for the municipalities we have forecasts for
78-
municipality_station_map = data.table(
79-
municipio_id = c("28079", "08019", "41091", "46250", "29067", "48020", "15030",
80-
"07040", "35016", "38023", "50297", "33044", "30030", "17079", "03014"),
81-
municipio_nombre = c("Madrid", "Barcelona", "Sevilla", "Valencia", "Málaga", "Bilbao",
82-
"A Coruña", "Palma", "Las Palmas", "Santa Cruz de Tenerife",
83-
"Zaragoza", "Oviedo", "Murcia", "Girona", "Alicante"),
84-
# Assign representative stations (this would need proper geographic mapping in production)
85-
primary_station = c("3195", "0076", "5783", "8416", "6155", "1082", "1387",
86-
"B228", "C649", "C427", "9434", "1208", "7228", "0367", "8025")
87-
)
88-
}
89-
90-
# For this simplified version, aggregate all stations to create "regional" summaries
91-
# that can be matched with municipal forecasts
92-
cat("Aggregating station data to regional summaries...\n")
93-
94-
# Create daily regional aggregates (mean across all stations with data each day)
95-
regional_daily = station_daily[, .(
61+
# Load station-municipality mapping table
62+
cat("Loading station-municipality mapping...\n")
63+
64+
if(!file.exists("data/input/station_point_municipaities_table.csv")) {
65+
cat("ERROR: Station-municipality mapping file not found: data/input/station_point_municipaities_table.csv\n")
66+
quit(save="no", status=1)
67+
}
68+
69+
station_municipality_map = fread("data/input/station_point_municipaities_table.csv")
70+
cat("Loaded mapping for", nrow(station_municipality_map), "stations to municipalities.\n")
71+
cat("Number of municipalities:", length(unique(station_municipality_map$NATCODE)), "\n")
72+
73+
# Create proper municipality-station aggregation
74+
cat("Aggregating station data by municipality...\n")
75+
# Join station data with municipality mapping
76+
cat("Joining station data with municipality mapping...\n")
77+
78+
# Merge station data with municipality mapping
79+
station_daily_with_municipality = merge(
80+
station_daily,
81+
station_municipality_map[, .(idema = INDICATIVO, municipio_id = NATCODE, municipio_nombre = NAMEUNIT)],
82+
by = "idema",
83+
all.x = TRUE # Keep all station data, even if not mapped
84+
)
85+
86+
cat("Stations with municipality mapping:",
87+
length(unique(station_daily_with_municipality$idema[!is.na(station_daily_with_municipality$municipio_id)])), "\n")
88+
cat("Stations without mapping:",
89+
length(unique(station_daily_with_municipality$idema[is.na(station_daily_with_municipality$municipio_id)])), "\n")
90+
91+
# Create municipal aggregates
92+
cat("Creating municipal aggregates from station data...\n")
93+
94+
municipal_daily = station_daily_with_municipality[!is.na(municipio_id), .(
9695
value = mean(value, na.rm = TRUE),
9796
n_stations = length(unique(idema)),
9897
source = "station_aggregate"
99-
), by = .(date, measure)]
98+
), by = .(date, municipio_id, municipio_nombre, measure)]
10099

101-
cat("Created regional daily aggregates:\n")
102-
cat(" Records:", nrow(regional_daily), "\n")
103-
cat(" Date range:", min(regional_daily$date), "to", max(regional_daily$date), "\n")
100+
cat("Created municipal daily aggregates:\n")
101+
cat(" Records:", nrow(municipal_daily), "\n")
102+
cat(" Municipalities:", length(unique(municipal_daily$municipio_id)), "\n")
103+
cat(" Date range:", min(municipal_daily$date), "to", max(municipal_daily$date), "\n")
104104

105105
# Convert forecast data to compatible format
106106
cat("Processing municipal forecast data...\n")
@@ -139,27 +139,35 @@ cat("Reshaped forecast data:\n")
139139
cat(" Records:", nrow(forecast_reshaped), "\n")
140140
cat(" Variables:", paste(unique(forecast_reshaped$measure), collapse=", "), "\n")
141141

142-
# For this simplified version, create a combined dataset using the major municipality (Madrid)
143-
# as representative, and combine with regional station aggregates
144-
madrid_forecasts = forecast_reshaped[municipio_id == "28079"]
145-
madrid_forecasts$municipio_id = NULL # Remove for joining with regional data
142+
# Match forecast data with municipal aggregates
143+
cat("Combining municipal station data with forecasts...\n")
144+
145+
# Filter forecast data to only municipalities that have station data
146+
available_municipalities = unique(municipal_daily$municipio_id)
147+
forecast_filtered = forecast_reshaped[municipio_id %in% available_municipalities]
146148

147-
# Combine regional station data with Madrid forecasts
148-
# Add municipality info to regional data (using Madrid as representative)
149-
regional_daily$municipio_id = "28079"
150-
regional_daily$municipio_nombre = "Madrid (Regional)"
149+
cat("Municipalities with both station data and forecasts:",
150+
length(intersect(unique(municipal_daily$municipio_id), unique(forecast_filtered$municipio_id))), "\n")
151151

152-
# Find the overlap/gap between station data and forecasts
153-
station_end_date = max(regional_daily$date, na.rm=TRUE)
154-
forecast_start_date = min(madrid_forecasts$date, na.rm=TRUE)
152+
# Find the overlap/gap between station data and forecasts by municipality
153+
overlap_summary = municipal_daily[, .(
154+
station_end_date = max(date, na.rm=TRUE),
155+
station_start_date = min(date, na.rm=TRUE)
156+
), by = municipio_id]
155157

156-
cat("Station data ends:", station_end_date, "\n")
157-
cat("Forecast data starts:", forecast_start_date, "\n")
158+
forecast_summary = forecast_filtered[, .(
159+
forecast_start_date = min(date, na.rm=TRUE),
160+
forecast_end_date = max(date, na.rm=TRUE)
161+
), by = municipio_id]
158162

159-
# Combine datasets
163+
coverage_summary = merge(overlap_summary, forecast_summary, by = "municipio_id", all = TRUE)
164+
cat("Coverage summary for municipalities:\n")
165+
print(coverage_summary[1:10]) # Show first 10 for brevity
166+
167+
# Combine municipal station data with forecasts
160168
combined_municipal = rbind(
161-
regional_daily[, .(date, municipio_id, municipio_nombre, measure, value, source)],
162-
madrid_forecasts[, .(date, municipio_id, municipio_nombre, measure, value, source)],
169+
municipal_daily[, .(date, municipio_id, municipio_nombre, measure, value, source)],
170+
forecast_filtered[, .(date, municipio_id, municipio_nombre, measure, value, source)],
163171
fill = TRUE
164172
)
165173

@@ -169,31 +177,53 @@ combined_municipal = combined_municipal[order(date, measure)]
169177
# Create summary
170178
cat("\n=== MUNICIPAL AGGREGATION SUMMARY ===\n")
171179
cat("Total municipal records:", nrow(combined_municipal), "\n")
180+
cat("Number of municipalities:", length(unique(combined_municipal$municipio_id)), "\n")
172181
cat("Date range:", min(combined_municipal$date, na.rm=TRUE), "to", max(combined_municipal$date, na.rm=TRUE), "\n")
173182
cat("Variables included:", paste(unique(combined_municipal$measure), collapse=", "), "\n")
174183

175184
# Summary by source
176185
source_summary = combined_municipal[, .(
177186
records = .N,
187+
municipalities = length(unique(municipio_id)),
178188
date_min = min(date, na.rm=TRUE),
179189
date_max = max(date, na.rm=TRUE)
180190
), by = source]
181191

192+
cat("\nBy source:\n")
182193
print(source_summary)
183194

195+
# Summary by municipality (top 10 by record count)
196+
municipality_summary = combined_municipal[, .(
197+
records = .N,
198+
variables = length(unique(measure)),
199+
date_min = min(date, na.rm=TRUE),
200+
date_max = max(date, na.rm=TRUE)
201+
), by = .(municipio_id, municipio_nombre)][order(-records)]
202+
203+
cat("\nTop 10 municipalities by record count:\n")
204+
print(municipality_summary[1:10])
205+
184206
# Summary by variable
185207
variable_summary = combined_municipal[, .(
186208
records = .N,
209+
municipalities = length(unique(municipio_id)),
187210
date_min = min(date, na.rm=TRUE),
188211
date_max = max(date, na.rm=TRUE)
189212
), by = measure]
190213

214+
cat("\nBy variable:\n")
191215
print(variable_summary)
192216

193217
# Save the combined municipal data
194-
output_file = "data/spain_weather_municipal_combined.csv.gz"
218+
output_file = "data/output/municipal_combined.csv.gz"
195219
fwrite(combined_municipal, output_file)
196220

221+
cat("\n=== AGGREGATION COMPLETE ===\n")
222+
cat("Municipal aggregated data saved to:", output_file, "\n")
223+
cat("File size:", round(file.size(output_file)/1024/1024, 1), "MB\n")
224+
cat("Total municipalities:", length(unique(combined_municipal$municipio_id)), "\n")
225+
cat("Date coverage:", min(combined_municipal$date, na.rm=TRUE), "to", max(combined_municipal$date, na.rm=TRUE), "\n")
226+
197227
cat("\n=== MUNICIPAL AGGREGATION COMPLETE ===\n")
198228
cat("Municipal combined data saved to:", output_file, "\n")
199229
cat("File size:", round(file.size(output_file)/1024/1024, 1), "MB\n")

code/check_station_info.R

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env Rscript

# Check what station metadata we can extract from existing observation data.
# Step 1: pull current observations from AEMET and inspect which fields look
#         like location info (province, coordinates, municipality, ...).
# Step 2: probe alternative endpoints that might expose a station inventory.
library(curl)
library(jsonlite)
library(dplyr)

# Load API keys (provides get_current_api_key())
source("auth/keys.R")

# --- Helpers -----------------------------------------------------------------

# Decode a raw HTTP body as UTF-8 text and parse it as JSON.
# AEMET payloads sometimes arrive with a wrong/missing encoding declaration,
# so force UTF-8 before parsing. The original script only did this for one of
# the three fetches; do it consistently for all of them.
parse_json_utf8 <- function(raw_body) {
  txt <- rawToChar(raw_body)
  Encoding(txt) <- "UTF-8"
  fromJSON(txt)
}

# Number of records in a parsed payload. jsonlite::fromJSON() simplifies JSON
# arrays of objects to a data.frame by default, and length() on a data.frame
# counts COLUMNS, not rows - so branch on the type.
record_count <- function(x) {
  if (is.data.frame(x)) nrow(x) else length(x)
}

# Field names of one record, for both payload shapes fromJSON() can return
# (data.frame after simplification, or a list of records).
record_fields <- function(x) {
  if (is.data.frame(x)) names(x) else names(x[[1]])
}

# --- Step 1: current observations --------------------------------------------

# Set up curl handle with API key
h <- new_handle()
handle_setopt(h, customrequest = "GET")
handle_setheaders(h, "api_key" = get_current_api_key())

cat("Checking station metadata from current observation data...\n\n")

# Get current observations (this endpoint is known to work)
req <- curl_fetch_memory("https://opendata.aemet.es/opendata/api/observacion/convencional/todas", handle = h)

if (req$status_code == 200) {
  response_content <- parse_json_utf8(req$content)
  cat("✅ Successfully got observations data URL\n")

  # AEMET wraps results in an envelope whose "datos" field holds the data URL
  data_req <- curl_fetch_memory(response_content$datos)
  if (data_req$status_code == 200) {
    station_data <- parse_json_utf8(data_req$content)
    cat("✅ Successfully retrieved station observation data\n")
    # record_count(): length() on a simplified data.frame would report columns
    cat("📊 Number of stations:", record_count(station_data), "\n\n")

    if (record_count(station_data) > 0) {
      # Convert to a data frame for easier analysis (no-op if already one)
      df <- bind_rows(station_data)

      cat("🔍 Available fields in observation data:\n")
      print(names(df))
      cat("\n")

      # Look for location-related fields
      location_fields <- names(df)[grepl("ubi|provincia|munic|ciudad|localidad|lon|lat|alt", names(df), ignore.case = TRUE)]
      cat("📍 Location-related fields found:", paste(location_fields, collapse = ", "), "\n\n")

      # Show a sample of station identifiers and any location info
      station_info <- df %>%
        select(any_of(c("idema", "ubi", "provincia", names(df)[grepl("lon|lat|alt", names(df), ignore.case = TRUE)]))) %>%
        distinct() %>%
        head(10)

      cat("📋 Sample station information:\n")
      print(station_info)

      # Check unique provinces if available
      if ("provincia" %in% names(df)) {
        cat("\n🗺️ Available provinces:\n")
        print(sort(unique(df$provincia)))
      }

      cat("\n🏢 Total unique stations:", length(unique(df$idema)), "\n")

    } else {
      cat("❌ No station data found\n")
    }
  } else {
    cat("❌ Failed to fetch observation data - Status:", data_req$status_code, "\n")
  }
} else {
  cat("❌ Failed to get observations URL - Status:", req$status_code, "\n")
}

# --- Step 2: alternative station-inventory endpoints -------------------------

cat("\n", rep("=", 50), "\n", sep = "")
cat("Trying alternative station info endpoints...\n\n")

alternative_endpoints <- c(
  "maestro/estacion/todas",
  "inventario/estaciones",
  "inventario/climatologico/estaciones",
  "inventario/observacion/estaciones",
  "maestro/inventario/estaciones"
)

for (endpoint in alternative_endpoints) {
  cat("Testing:", endpoint, "\n")

  # A failure on one endpoint must not abort the whole survey
  tryCatch({
    url <- paste0("https://opendata.aemet.es/opendata/api/", endpoint)
    req <- curl_fetch_memory(url, handle = h)

    if (req$status_code == 200) {
      cat("  ✅ SUCCESS!\n")
      response_content <- parse_json_utf8(req$content)

      if ("datos" %in% names(response_content)) {
        cat("  📊 Has data URL - attempting to fetch...\n")
        data_req <- curl_fetch_memory(response_content$datos)
        if (data_req$status_code == 200) {
          station_meta <- parse_json_utf8(data_req$content)
          cat("  📈 Retrieved", record_count(station_meta), "records\n")

          if (record_count(station_meta) > 0) {
            # record_fields() handles both payload shapes; the original indexed
            # [[1]], which extracts a COLUMN when fromJSON returns a data.frame
            cat("  🔍 Fields:", paste(record_fields(station_meta), collapse = ", "), "\n")
          }
        }
      }
    } else {
      cat("  ❌ Status:", req$status_code, "\n")
    }

  }, error = function(e) {
    cat("  ❌ ERROR:", e$message, "\n")
  })

  cat("\n")
}

code/test_station_metadata.R

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env Rscript

# Test script to explore the AEMET API for station metadata.
# Looks for endpoints that provide station locations / municipality mappings.
library(curl)
library(jsonlite)

# Load API keys (provides get_current_api_key())
source("auth/keys.R")

# Decode a raw HTTP body as UTF-8 and parse it as JSON. AEMET payloads can
# carry mis-declared encodings, so force UTF-8 before parsing (the sibling
# check_station_info.R script already does this; be consistent here too).
parse_json_utf8 <- function(raw_body) {
  txt <- rawToChar(raw_body)
  Encoding(txt) <- "UTF-8"
  fromJSON(txt)
}

# Number of records in a parsed payload. fromJSON() simplifies JSON arrays of
# objects to a data.frame by default, where length() counts columns, not rows.
record_count <- function(x) {
  if (is.data.frame(x)) nrow(x) else length(x)
}

# Field names of one record, for both payload shapes fromJSON() can return.
record_fields <- function(x) {
  if (is.data.frame(x)) names(x) else names(x[[1]])
}

# Set up curl handle with API key
h <- new_handle()
handle_setopt(h, customrequest = "GET")
handle_setheaders(h, "api_key" = get_current_api_key())

cat("Testing AEMET API endpoints for station metadata...\n\n")

# Test different potential endpoints for station information
test_endpoints <- c(
  "maestro/estacion",                     # Master station list
  "estaciones",                           # Stations
  "estaciones/todas",                     # All stations
  "maestro/estaciones",                   # Master stations
  "observacion/convencional/estaciones",  # Observation stations
  "valores/climatologicos/estaciones",    # Climatological stations
  "red/estaciones"                        # Station network
)

for (endpoint in test_endpoints) {
  cat("Testing endpoint:", endpoint, "\n")

  # A failure on one endpoint must not abort the whole survey
  tryCatch({
    url <- paste0("https://opendata.aemet.es/opendata/api/", endpoint)
    req <- curl_fetch_memory(url, handle = h)

    if (req$status_code == 200) {
      response_content <- parse_json_utf8(req$content)
      cat("  ✅ SUCCESS - Status:", req$status_code, "\n")

      if ("datos" %in% names(response_content)) {
        cat("  📊 Has 'datos' field - fetching actual data...\n")

        # AEMET envelope: the "datos" field points at the real payload URL
        data_req <- curl_fetch_memory(response_content$datos)
        if (data_req$status_code == 200) {
          station_data <- parse_json_utf8(data_req$content)
          cat("  📈 Data retrieved successfully\n")
          cat("  📋 Number of records:", record_count(station_data), "\n")

          if (record_count(station_data) > 0) {
            flds <- record_fields(station_data)

            # Show structure of the first record. The original used
            # station_data[[1]], which extracts a COLUMN when fromJSON
            # returns a data.frame - branch on the type instead.
            cat("  🔍 First record structure:\n")
            if (is.data.frame(station_data)) {
              str(head(station_data, 1))
            } else {
              str(station_data[[1]])
            }
            cat("  📍 Available fields:", paste(flds, collapse = ", "), "\n")

            # Look for municipality or location fields
            location_fields <- flds[grepl("munic|provincia|ciudad|localidad|ubicacion|lon|lat", flds, ignore.case = TRUE)]
            if (length(location_fields) > 0) {
              cat("  🎯 FOUND LOCATION FIELDS:", paste(location_fields, collapse = ", "), "\n")
            }
          }
        } else {
          cat("  ❌ Failed to fetch data - Status:", data_req$status_code, "\n")
        }
      } else {
        cat("  📋 Direct response fields:", paste(names(response_content), collapse = ", "), "\n")
      }

    } else {
      cat("  ❌ FAILED - Status:", req$status_code, "\n")
    }

  }, error = function(e) {
    cat("  ❌ ERROR:", e$message, "\n")
  })

  cat("\n")
}

cat("Station metadata exploration complete.\n")

0 commit comments

Comments
 (0)