|
| 1 | +#!/usr/bin/env Rscript |
| 2 | + |
| 3 | +# Correct variable standardization based on official documentation |
| 4 | +# Uses the proper naming from docs/variable_standardization.md |
| 5 | + |
| 6 | +library(dplyr, warn.conflicts = FALSE) |
| 7 | + |
# Rename the columns of `data` according to `variable_map` without ever
# producing duplicate column names.
#
# Args:
#   data: A data.frame (or any object supporting colnames() and colnames<-).
#   variable_map: Named list or named character vector. Names are current
#     column names; values are the desired new column names.
#
# Returns:
#   `data` with every applicable column renamed. A column is left untouched
#   when it has no entry in `variable_map`, when its entry maps it to its own
#   name, or when the target name is already in use.
#
# Side effects:
#   Prints one line per rename and one line per skipped collision via cat().
safe_rename_columns <- function(data, variable_map) {
  col_names <- colnames(data)
  mapped_names <- names(variable_map)

  # Names that may not be used as a rename target: every column present on
  # entry plus every target claimed by an earlier rename in this call. The
  # second part is what stops two source columns that share a target
  # (e.g. two aliases of the same identifier) from both being renamed to it.
  taken <- col_names

  for (i in seq_along(col_names)) {
    old_name <- col_names[[i]]
    if (!(old_name %in% mapped_names)) {
      next
    }

    new_name <- variable_map[[old_name]]

    # Identity mapping: the column already carries its final name, so there is
    # nothing to rename and nothing worth reporting.
    if (identical(new_name, old_name)) {
      next
    }

    if (new_name %in% taken) {
      cat(" Skipping:", old_name, "-> target", new_name, "already exists\n")
      next
    }

    cat(" Will rename:", old_name, "->", new_name, "\n")
    # Rename by position so exactly one column receives the new name.
    col_names[[i]] <- new_name
    taken <- c(taken, new_name)
  }

  colnames(data) <- col_names
  data
}
| 35 | + |
# Column mapping for the daily station dataset.
# Each entry reads: source column name = standardized column name.
# The entries are grouped by theme and concatenated into one flat named list.
daily_station_variables <- c(
  # Identifiers and station metadata. Both `indicativo` and `idema` are
  # listed as sources for station_id.
  list(
    fecha      = "date",
    indicativo = "station_id",
    idema      = "station_id",
    nombre     = "station_name",
    provincia  = "province",
    altitud    = "altitude"
  ),
  # Temperature values and the times of the daily extremes.
  list(
    tmed     = "temp_mean",
    tmin     = "temp_min",
    tmax     = "temp_max",
    horatmin = "time_temp_min",
    horatmax = "time_temp_max"
  ),
  # Precipitation.
  list(
    prec = "precipitation"
  ),
  # Wind. Targets are wind_speed / wind_gust / time_wind_gust, NOT the
  # *_mean / *_max variants.
  list(
    dir       = "wind_direction",
    velmedia  = "wind_speed",
    racha     = "wind_gust",
    horaracha = "time_wind_gust"
  ),
  # Atmospheric pressure extremes and their times.
  list(
    presMax     = "pressure_max",
    horaPresMax = "time_pressure_max",
    presMin     = "pressure_min",
    horaPresMin = "time_pressure_min"
  ),
  # Relative humidity.
  list(
    hrMedia   = "humidity_mean",
    hrMax     = "humidity_max",
    horaHrMax = "time_humidity_max",
    hrMin     = "humidity_min",
    horaHrMin = "time_humidity_min"
  ),
  # Sunshine. Target is solar_hours, NOT solar_radiation.
  list(
    sol = "solar_hours"
  ),
  # Repairs for names produced by an earlier, incorrect renaming pass.
  list(
    wind_speed_mean    = "wind_speed",
    wind_gust_max      = "wind_gust",
    time_wind_gust_max = "time_wind_gust",
    solar_radiation    = "solar_hours"
  ),
  # Quality-control flags.
  list(
    temp_range_ok  = "qc_temp_range",
    temp_realistic = "qc_temp_realistic",
    prec_realistic = "qc_prec_realistic"
  ),
  # Collection / processing metadata.
  list(
    collected_at   = "collection_timestamp",
    processed_at   = "processing_timestamp",
    source         = "data_source",
    n_observations = "observation_count"
  )
)
| 95 | + |
# Column mapping for the municipal dataset.
# Each entry reads: source column name = standardized column name.
# Entries whose source and target are the same name are kept as listed.
municipal_variables <- c(
  # Identifiers; several source spellings are listed for municipality_id.
  list(
    municipio_id     = "municipality_id",
    municipio_nombre = "municipality_name",
    municipio        = "municipality_id",
    municipio_code   = "municipality_id",
    provincia        = "province",
    fecha            = "date",
    elaborado        = "forecast_issued_at"
  ),
  # Temperature.
  list(
    temp_avg       = "temp_mean",
    tmed_municipal = "temp_mean",
    temp_max       = "temp_max",
    tmax_municipal = "temp_max",
    temp_min       = "temp_min",
    tmin_municipal = "temp_min"
  ),
  # Humidity. Targets are humidity_max / humidity_min, NOT hrMax / hrMin.
  list(
    humid_max         = "humidity_max",
    humid_min         = "humidity_min",
    hrMedia_municipal = "humidity_mean"
  ),
  # Wind.
  list(
    wind_speed         = "wind_speed",
    velmedia_municipal = "wind_speed"
  ),
  # Data source and priority.
  list(
    data_source = "data_source",
    source      = "data_source",
    priority    = "data_priority"
  ),
  # Quality-control flags.
  list(
    temp_range_ok  = "qc_temp_range",
    temp_realistic = "qc_temp_realistic"
  ),
  # Collection / processing metadata.
  list(
    collected_at = "collection_timestamp",
    processed_at = "processing_timestamp",
    n_stations   = "n_stations"
  )
)
| 137 | + |
# Column mapping for the hourly station dataset.
# Each entry reads: source column name = standardized column name.
# `date` and `value` are listed with their own name as target.
hourly_variables <- list(
  idema   = "station_id",
  fint    = "datetime",
  date    = "date",
  measure = "variable_type",
  value   = "value"
)
| 146 | + |
# Process each dataset with its column mapping.
#
# For every configured CSV file in `data_dir`:
#   1. read the file,
#   2. copy it to a timestamped backup next to the original,
#   3. rename its columns with safe_rename_columns(),
#   4. overwrite the original file with the renamed data.
# A file that does not exist is reported and skipped. A file that fails at any
# step is reported, left for the next run, and recorded in `failed_files`.
datasets <- list(
  list(file = "daily_station_historical.csv", vars = daily_station_variables),
  list(file = "daily_municipal_extended.csv", vars = municipal_variables),
  list(file = "hourly_station_ongoing.csv", vars = hourly_variables)
)

# Directory holding the CSV files. The WEATHER_DATA_DIR environment variable
# overrides the built-in default so the script is usable outside one machine;
# an unset or empty variable falls back to the default.
default_data_dir <-
  "/home/j.palmer/research/weather-data-collector-spain/data/output"
data_dir <- Sys.getenv("WEATHER_DATA_DIR", unset = default_data_dir)
if (!nzchar(data_dir)) {
  data_dir <- default_data_dir
}

# Files whose processing raised an error (missing files are not counted).
failed_files <- character(0)

for (dataset in datasets) {
  dataset_file <- dataset$file
  variable_map <- dataset$vars
  file_path <- file.path(data_dir, dataset_file)

  if (!file.exists(file_path)) {
    cat("Warning: File", dataset_file, "not found\n")
    next
  }

  cat("\n=== Processing:", dataset_file, "===\n")

  # The expression yields TRUE on success; the handler yields FALSE, so the
  # outcome can be recorded without assigning from inside the handler.
  succeeded <- tryCatch({
    # NOTE(review): read.csv() infers column types and write.csv() re-formats
    # them, so values may not round-trip byte-for-byte (e.g. zero-padded
    # codes) - confirm this is acceptable for these files.
    data <- read.csv(file_path, stringsAsFactors = FALSE)

    cat(" Original dimensions:", nrow(data), "rows,", ncol(data), "columns\n")
    cat(" Original columns:", paste(head(colnames(data), 10), collapse = ", "), "...\n")

    # Create the backup and refuse to continue without it: file.copy()
    # signals failure by returning FALSE, not by raising an error, and the
    # original file is overwritten further down.
    backup_file <- paste0(
      file_path, ".backup_corrected_", format(Sys.time(), "%Y%m%d_%H%M%S")
    )
    if (!isTRUE(file.copy(file_path, backup_file))) {
      stop("could not create backup ", basename(backup_file), call. = FALSE)
    }
    cat(" Backup created:", basename(backup_file), "\n")

    # Apply the column mapping.
    data_corrected <- safe_rename_columns(data, variable_map)

    cat(" Final dimensions:", nrow(data_corrected), "rows,", ncol(data_corrected), "columns\n")
    cat(" Final columns:", paste(head(colnames(data_corrected), 10), collapse = ", "), "...\n")

    # Overwrite the original file with the renamed data.
    write.csv(data_corrected, file_path, row.names = FALSE)
    cat(" ✅ Successfully corrected", dataset_file, "\n")

    TRUE
  }, error = function(e) {
    cat(" ❌ Error processing", dataset_file, ":", conditionMessage(e), "\n")
    FALSE
  })

  if (!succeeded) {
    failed_files <- c(failed_files, dataset_file)
  }
}

# Only announce success when no dataset failed.
if (length(failed_files) == 0) {
  cat("\n🎯 Variable standardization corrected according to documentation!\n")
} else {
  cat("\n⚠️ Variable standardization incomplete; failed files:",
      paste(failed_files, collapse = ", "), "\n")
}
0 commit comments