
Commit 083fdfc

Fixes existing data and the aggregation scripts that produce it.
1 parent 626e0d0 commit 083fdfc

File tree

4 files changed, +689 -0 lines changed

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
#!/usr/bin/env Rscript

# Proper consolidation of duplicate columns in municipal dataset
# This script examines the content of duplicate columns and consolidates them intelligently

library(data.table)
library(dplyr)

cat("=== FIXING MUNICIPAL DATASET DUPLICATE COLUMNS ===\n")

# Load the data with fread to see actual column structure
data <- fread("data/output/daily_municipal_extended.csv")
cat("Original data: ", nrow(data), "rows, ", ncol(data), "columns\n")

# Get original column names before any processing
original_cols <- colnames(data)
cat("Original columns:\n")
print(original_cols)

# Create a clean version by examining each type of duplicate

# 1. MUNICIPALITY ID CONSOLIDATION
cat("\n=== CONSOLIDATING MUNICIPALITY ID ===\n")
muni_id_positions <- which(original_cols == "municipality_id")
cat("municipality_id appears at positions:", paste(muni_id_positions, collapse=", "), "\n")

if (length(muni_id_positions) > 1) {
  # Examine content of each municipality_id column
  for (i in seq_along(muni_id_positions)) {
    pos <- muni_id_positions[i]
    col_data <- data[[pos]]
    non_na_count <- sum(!is.na(col_data))
    sample_vals <- head(unique(col_data[!is.na(col_data)]), 5)  # head() avoids NA padding when fewer than 5 values
    cat(" Position", pos, ": ", non_na_count, "non-NA values, sample:", paste(sample_vals, collapse=", "), "\n")
  }

  # Keep the column with most non-NA values, or first if tied
  best_muni_col <- muni_id_positions[1]
  for (pos in muni_id_positions) {
    if (sum(!is.na(data[[pos]])) > sum(!is.na(data[[best_muni_col]]))) {
      best_muni_col <- pos
    }
  }

  cat(" Keeping municipality_id from position", best_muni_col, "\n")

  # Create new clean dataset starting with this column
  clean_data <- data.frame(municipality_id = data[[best_muni_col]])
} else {
  clean_data <- data.frame(municipality_id = data[["municipality_id"]])
}

# 2. ADD OTHER UNIQUE COLUMNS (non-duplicated ones)
cat("\n=== ADDING UNIQUE COLUMNS ===\n")
unique_cols <- setdiff(original_cols, c("municipality_id", "temp_mean"))  # setdiff() also drops repeated names
for (col in unique_cols) {
  clean_data[[col]] <- data[[col]]
  cat(" Added:", col, "\n")
}

# 3. TEMP_MEAN CONSOLIDATION
cat("\n=== CONSOLIDATING TEMP_MEAN ===\n")
temp_mean_positions <- which(original_cols == "temp_mean")
cat("temp_mean appears at positions:", paste(temp_mean_positions, collapse=", "), "\n")

if (length(temp_mean_positions) > 1) {
  # Examine content of each temp_mean column
  for (i in seq_along(temp_mean_positions)) {
    pos <- temp_mean_positions[i]
    col_data <- data[[pos]]
    non_na_count <- sum(!is.na(col_data))
    sample_vals <- head(unique(col_data[!is.na(col_data)]), 5)  # head() avoids NA padding when fewer than 5 values
    cat(" Position", pos, ": ", non_na_count, "non-NA values, sample:", paste(sample_vals, collapse=", "), "\n")
  }

  # Keep the column with most non-NA values
  best_temp_col <- temp_mean_positions[1]
  for (pos in temp_mean_positions) {
    if (sum(!is.na(data[[pos]])) > sum(!is.na(data[[best_temp_col]]))) {
      best_temp_col <- pos
    }
  }

  cat(" Keeping temp_mean from position", best_temp_col, "\n")
  clean_data$temp_mean <- data[[best_temp_col]]
} else if (length(temp_mean_positions) == 1) {
  clean_data$temp_mean <- data[[temp_mean_positions[1]]]
}

# 4. HANDLE REMAINING UNMAPPED COLUMNS
cat("\n=== MAPPING REMAINING COLUMNS ===\n")
# Map remaining municipal-specific column names to standard names
remaining_mappings <- list(
  "tmax_municipal" = "temp_max",
  "tmin_municipal" = "temp_min",
  "velmedia_municipal" = "wind_speed",
  "hrMedia_municipal" = "humidity_mean"
)

for (old_name in names(remaining_mappings)) {
  new_name <- remaining_mappings[[old_name]]
  if (old_name %in% colnames(clean_data)) {
    # If target column doesn't exist or is empty, use this one
    if (!new_name %in% colnames(clean_data) || all(is.na(clean_data[[new_name]]))) {
      clean_data[[new_name]] <- clean_data[[old_name]]
      clean_data[[old_name]] <- NULL  # Remove old column
      cat(" Mapped:", old_name, "->", new_name, "\n")
    } else {
      cat(" Skipped:", old_name, "- target", new_name, "already exists with data\n")
    }
  }
}

# 5. FINAL COLUMN ORGANIZATION
cat("\n=== FINAL ORGANIZATION ===\n")

# Define the proper order according to documentation
standard_order <- c(
  "municipality_id", "municipality_name", "province", "date",
  "temp_mean", "temp_max", "temp_min",
  "humidity_mean", "humidity_max", "humidity_min",
  "wind_speed", "forecast_issued_at", "data_source", "data_priority",
  "collection_timestamp", "processing_timestamp",
  "qc_temp_range", "qc_temp_realistic", "n_stations", "source"
)

# Reorder columns - put standard ones first, then any extras
available_standard <- intersect(standard_order, colnames(clean_data))
extra_cols <- setdiff(colnames(clean_data), standard_order)

final_order <- c(available_standard, extra_cols)
clean_data <- clean_data[, final_order, drop = FALSE]

cat("Final dataset: ", nrow(clean_data), "rows, ", ncol(clean_data), "columns\n")
cat("Final columns:\n")
print(colnames(clean_data))

# Check for any remaining numbered columns
numbered_cols <- grep("\\.[0-9]+$", colnames(clean_data), value = TRUE)
if (length(numbered_cols) > 0) {
  cat("❌ WARNING: Still have numbered columns:", paste(numbered_cols, collapse = ", "), "\n")
} else {
  cat("✅ No numbered columns remaining\n")
}

# 6. SAVE CLEANED VERSION
cat("\n=== SAVING CLEANED DATA ===\n")

# Create backup
backup_file <- paste0("data/output/daily_municipal_extended.csv.backup_before_consolidation_",
                      format(Sys.time(), "%Y%m%d_%H%M%S"))
file.copy("data/output/daily_municipal_extended.csv", backup_file)
cat("Backup created:", basename(backup_file), "\n")

# Write clean version
write.csv(clean_data, "data/output/daily_municipal_extended.csv", row.names = FALSE)
cat("✅ Cleaned municipal dataset saved\n")

cat("\n🎯 MUNICIPAL DATASET CONSOLIDATION COMPLETE\n")
Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
#!/usr/bin/env Rscript

# Correct variable standardization based on official documentation
# Uses the proper naming from docs/variable_standardization.md

library(dplyr, warn.conflicts = FALSE)

# Function to safely rename columns
safe_rename_columns <- function(data, variable_map) {
  current_cols <- colnames(data)
  rename_list <- list()
  planned_targets <- character(0)

  for (old_name in current_cols) {
    if (old_name %in% names(variable_map)) {
      new_name <- variable_map[[old_name]]
      # Only rename if the new name doesn't already exist and hasn't been
      # claimed by an earlier source column (e.g. indicativo and idema both
      # map to station_id; only the first is renamed)
      if (!new_name %in% colnames(data) && !new_name %in% planned_targets) {
        rename_list[[old_name]] <- new_name
        planned_targets <- c(planned_targets, new_name)
        cat(" Will rename:", old_name, "->", new_name, "\n")
      } else {
        cat(" Skipping:", old_name, "-> target", new_name, "already exists\n")
      }
    }
  }

  # Apply renames
  if (length(rename_list) > 0) {
    for (old_name in names(rename_list)) {
      colnames(data)[colnames(data) == old_name] <- rename_list[[old_name]]
    }
  }

  return(data)
}

# CORRECTED variable mappings based on documentation
daily_station_variables <- list(
  # Core identifiers - keep both for now
  "fecha" = "date",
  "indicativo" = "station_id",
  "idema" = "station_id",
  "nombre" = "station_name",
  "provincia" = "province",
  "altitud" = "altitude",

  # Temperature variables - correct names
  "tmed" = "temp_mean",
  "tmin" = "temp_min",
  "tmax" = "temp_max",
  "horatmin" = "time_temp_min",
  "horatmax" = "time_temp_max",

  # Precipitation
  "prec" = "precipitation",

  # Wind variables - correct names from docs
  "dir" = "wind_direction",
  "velmedia" = "wind_speed",      # NOT wind_speed_mean
  "racha" = "wind_gust",          # NOT wind_gust_max
  "horaracha" = "time_wind_gust", # NOT time_wind_gust_max

  # Atmospheric pressure
  "presMax" = "pressure_max",
  "horaPresMax" = "time_pressure_max",
  "presMin" = "pressure_min",
  "horaPresMin" = "time_pressure_min",

  # Humidity
  "hrMedia" = "humidity_mean",
  "hrMax" = "humidity_max",
  "horaHrMax" = "time_humidity_max",
  "hrMin" = "humidity_min",
  "horaHrMin" = "time_humidity_min",

  # Solar radiation - correct name from docs
  "sol" = "solar_hours", # NOT solar_radiation

  # Fix incorrect previous renames
  "wind_speed_mean" = "wind_speed",
  "wind_gust_max" = "wind_gust",
  "time_wind_gust_max" = "time_wind_gust",
  "solar_radiation" = "solar_hours",

  # Quality control flags
  "temp_range_ok" = "qc_temp_range",
  "temp_realistic" = "qc_temp_realistic",
  "prec_realistic" = "qc_prec_realistic",

  # Metadata
  "collected_at" = "collection_timestamp",
  "processed_at" = "processing_timestamp",
  "source" = "data_source",
  "n_observations" = "observation_count"
)

municipal_variables <- list(
  # Core identifiers - correct names
  "municipio_id" = "municipality_id",
  "municipio_nombre" = "municipality_name",
  "municipio" = "municipality_id",
  "municipio_code" = "municipality_id",
  "provincia" = "province",
  "fecha" = "date",
  "elaborado" = "forecast_issued_at",

  # Temperature variables
  "temp_avg" = "temp_mean",
  "tmed_municipal" = "temp_mean",
  "temp_max" = "temp_max",
  "tmax_municipal" = "temp_max",
  "temp_min" = "temp_min",
  "tmin_municipal" = "temp_min",

  # Humidity - correct names from docs
  "humid_max" = "humidity_max", # NOT hrMax
  "humid_min" = "humidity_min", # NOT hrMin
  "hrMedia_municipal" = "humidity_mean",

  # Wind
  "wind_speed" = "wind_speed",
  "velmedia_municipal" = "wind_speed",

  # Data source and priority
  "data_source" = "data_source",
  "source" = "data_source",
  "priority" = "data_priority",

  # Quality control
  "temp_range_ok" = "qc_temp_range",
  "temp_realistic" = "qc_temp_realistic",

  # Metadata
  "collected_at" = "collection_timestamp",
  "processed_at" = "processing_timestamp",
  "n_stations" = "n_stations"
)

hourly_variables <- list(
  # Core identifiers
  "idema" = "station_id",
  "fint" = "datetime",
  "date" = "date",
  "measure" = "variable_type",
  "value" = "value"
)

# Process each dataset with correct mapping
datasets <- list(
  list(file = "daily_station_historical.csv", vars = daily_station_variables),
  list(file = "daily_municipal_extended.csv", vars = municipal_variables),
  list(file = "hourly_station_ongoing.csv", vars = hourly_variables)
)

data_dir <- "/home/j.palmer/research/weather-data-collector-spain/data/output"

for (dataset in datasets) {
  dataset_file <- dataset$file
  variable_map <- dataset$vars
  file_path <- file.path(data_dir, dataset_file)

  if (!file.exists(file_path)) {
    cat("Warning: File", dataset_file, "not found\n")
    next
  }

  cat("\n=== Processing:", dataset_file, "===\n")

  tryCatch({
    # Read the data
    data <- read.csv(file_path, stringsAsFactors = FALSE)

    cat(" Original dimensions:", nrow(data), "rows,", ncol(data), "columns\n")
    cat(" Original columns:", paste(head(colnames(data), 10), collapse = ", "), "...\n")

    # Create backup
    backup_file <- paste0(file_path, ".backup_corrected_", format(Sys.time(), "%Y%m%d_%H%M%S"))
    file.copy(file_path, backup_file)
    cat(" Backup created:", basename(backup_file), "\n")

    # Apply correct standardization
    data_corrected <- safe_rename_columns(data, variable_map)

    cat(" Final dimensions:", nrow(data_corrected), "rows,", ncol(data_corrected), "columns\n")
    cat(" Final columns:", paste(head(colnames(data_corrected), 10), collapse = ", "), "...\n")

    # Write corrected version
    write.csv(data_corrected, file_path, row.names = FALSE)
    cat(" ✅ Successfully corrected", dataset_file, "\n")

  }, error = function(e) {
    cat(" ❌ Error processing", dataset_file, ":", e$message, "\n")
  })
}

cat("\n🎯 Variable standardization corrected according to documentation!\n")
