
Commit 083fdfc

Fixes existing data and the aggregation scripts that produce it.
1 parent 626e0d0 commit 083fdfc

File tree

4 files changed, +689 -0 lines changed

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
#!/usr/bin/env Rscript

# Proper consolidation of duplicate columns in municipal dataset
# This script examines the content of duplicate columns and consolidates them intelligently

library(data.table)
library(dplyr)

cat("=== FIXING MUNICIPAL DATASET DUPLICATE COLUMNS ===\n")

# Load the data with fread to see actual column structure
data <- fread("data/output/daily_municipal_extended.csv")
cat("Original data: ", nrow(data), "rows, ", ncol(data), "columns\n")

# Get original column names before any processing
original_cols <- colnames(data)
cat("Original columns:\n")
print(original_cols)

# Create a clean version by examining each type of duplicate

# 1. MUNICIPALITY ID CONSOLIDATION
cat("\n=== CONSOLIDATING MUNICIPALITY ID ===\n")
muni_id_positions <- which(original_cols == "municipality_id")
cat("municipality_id appears at positions:", paste(muni_id_positions, collapse=", "), "\n")

if (length(muni_id_positions) > 1) {
  # Examine content of each municipality_id column
  for (i in seq_along(muni_id_positions)) {
    pos <- muni_id_positions[i]
    col_data <- data[[pos]]
    non_na_count <- sum(!is.na(col_data))
    sample_vals <- head(unique(col_data[!is.na(col_data)]), 5)  # head() avoids NA padding when fewer than 5 values
    cat(" Position", pos, ": ", non_na_count, "non-NA values, sample:", paste(sample_vals, collapse=", "), "\n")
  }

  # Keep the column with most non-NA values, or first if tied
  best_muni_col <- muni_id_positions[1]
  for (pos in muni_id_positions) {
    if (sum(!is.na(data[[pos]])) > sum(!is.na(data[[best_muni_col]]))) {
      best_muni_col <- pos
    }
  }

  cat(" Keeping municipality_id from position", best_muni_col, "\n")

  # Create new clean dataset starting with this column
  clean_data <- data.frame(municipality_id = data[[best_muni_col]])
} else {
  clean_data <- data.frame(municipality_id = data[["municipality_id"]])
}

# 2. ADD OTHER UNIQUE COLUMNS (non-duplicated ones)
cat("\n=== ADDING UNIQUE COLUMNS ===\n")
unique_cols <- setdiff(original_cols, c("municipality_id", "temp_mean"))  # setdiff() also drops repeated names
for (col in unique_cols) {
  clean_data[[col]] <- data[[col]]
  cat(" Added:", col, "\n")
}

# 3. TEMP_MEAN CONSOLIDATION
cat("\n=== CONSOLIDATING TEMP_MEAN ===\n")
temp_mean_positions <- which(original_cols == "temp_mean")
cat("temp_mean appears at positions:", paste(temp_mean_positions, collapse=", "), "\n")

if (length(temp_mean_positions) > 1) {
  # Examine content of each temp_mean column
  for (i in seq_along(temp_mean_positions)) {
    pos <- temp_mean_positions[i]
    col_data <- data[[pos]]
    non_na_count <- sum(!is.na(col_data))
    sample_vals <- head(unique(col_data[!is.na(col_data)]), 5)  # head() avoids NA padding when fewer than 5 values
    cat(" Position", pos, ": ", non_na_count, "non-NA values, sample:", paste(sample_vals, collapse=", "), "\n")
  }

  # Keep the column with most non-NA values
  best_temp_col <- temp_mean_positions[1]
  for (pos in temp_mean_positions) {
    if (sum(!is.na(data[[pos]])) > sum(!is.na(data[[best_temp_col]]))) {
      best_temp_col <- pos
    }
  }

  cat(" Keeping temp_mean from position", best_temp_col, "\n")
  clean_data$temp_mean <- data[[best_temp_col]]
} else if (length(temp_mean_positions) == 1) {
  clean_data$temp_mean <- data[[temp_mean_positions[1]]]
}

# 4. HANDLE REMAINING UNMAPPED COLUMNS
cat("\n=== MAPPING REMAINING COLUMNS ===\n")
# Map remaining municipal-specific column names to standard names
remaining_mappings <- list(
  "tmax_municipal" = "temp_max",
  "tmin_municipal" = "temp_min",
  "velmedia_municipal" = "wind_speed",
  "hrMedia_municipal" = "humidity_mean"
)

for (old_name in names(remaining_mappings)) {
  new_name <- remaining_mappings[[old_name]]
  if (old_name %in% colnames(clean_data)) {
    # If target column doesn't exist or is empty, use this one
    if (!new_name %in% colnames(clean_data) || all(is.na(clean_data[[new_name]]))) {
      clean_data[[new_name]] <- clean_data[[old_name]]
      clean_data[[old_name]] <- NULL  # Remove old column
      cat(" Mapped:", old_name, "->", new_name, "\n")
    } else {
      cat(" Skipped:", old_name, "- target", new_name, "already exists with data\n")
    }
  }
}

# 5. FINAL COLUMN ORGANIZATION
cat("\n=== FINAL ORGANIZATION ===\n")

# Define the proper order according to documentation
standard_order <- c(
  "municipality_id", "municipality_name", "province", "date",
  "temp_mean", "temp_max", "temp_min",
  "humidity_mean", "humidity_max", "humidity_min",
  "wind_speed", "forecast_issued_at", "data_source", "data_priority",
  "collection_timestamp", "processing_timestamp",
  "qc_temp_range", "qc_temp_realistic", "n_stations", "source"
)

# Reorder columns - put standard ones first, then any extras
available_standard <- intersect(standard_order, colnames(clean_data))
extra_cols <- setdiff(colnames(clean_data), standard_order)

final_order <- c(available_standard, extra_cols)
clean_data <- clean_data[, final_order, drop = FALSE]

cat("Final dataset: ", nrow(clean_data), "rows, ", ncol(clean_data), "columns\n")
cat("Final columns:\n")
print(colnames(clean_data))

# Check for any remaining numbered columns
numbered_cols <- grep("\\.[0-9]+$", colnames(clean_data), value = TRUE)
if (length(numbered_cols) > 0) {
  cat("❌ WARNING: Still have numbered columns:", paste(numbered_cols, collapse = ", "), "\n")
} else {
  cat("✅ No numbered columns remaining\n")
}

# 6. SAVE CLEANED VERSION
cat("\n=== SAVING CLEANED DATA ===\n")

# Create backup
backup_file <- paste0("data/output/daily_municipal_extended.csv.backup_before_consolidation_",
                      format(Sys.time(), "%Y%m%d_%H%M%S"))
file.copy("data/output/daily_municipal_extended.csv", backup_file)
cat("Backup created:", basename(backup_file), "\n")

# Write clean version
write.csv(clean_data, "data/output/daily_municipal_extended.csv", row.names = FALSE)
cat("✅ Cleaned municipal dataset saved\n")

cat("\n🎯 MUNICIPAL DATASET CONSOLIDATION COMPLETE\n")
Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
#!/usr/bin/env Rscript

# Correct variable standardization based on official documentation
# Uses the proper naming from docs/variable_standardization.md

library(dplyr, warn.conflicts = FALSE)

# Function to safely rename columns
safe_rename_columns <- function(data, variable_map) {
  current_cols <- colnames(data)
  rename_list <- list()
  planned_targets <- character(0)

  for (old_name in current_cols) {
    if (old_name %in% names(variable_map)) {
      new_name <- variable_map[[old_name]]
      # Only rename if the new name doesn't already exist and hasn't been
      # claimed by an earlier source column (e.g. indicativo and idema both
      # map to station_id; only the first is renamed)
      if (!new_name %in% colnames(data) && !new_name %in% planned_targets) {
        rename_list[[old_name]] <- new_name
        planned_targets <- c(planned_targets, new_name)
        cat(" Will rename:", old_name, "->", new_name, "\n")
      } else {
        cat(" Skipping:", old_name, "-> target", new_name, "already exists\n")
      }
    }
  }

  # Apply renames
  if (length(rename_list) > 0) {
    for (old_name in names(rename_list)) {
      colnames(data)[colnames(data) == old_name] <- rename_list[[old_name]]
    }
  }

  return(data)
}

# CORRECTED variable mappings based on documentation
daily_station_variables <- list(
  # Core identifiers - keep both for now
  "fecha" = "date",
  "indicativo" = "station_id",
  "idema" = "station_id",
  "nombre" = "station_name",
  "provincia" = "province",
  "altitud" = "altitude",

  # Temperature variables - correct names
  "tmed" = "temp_mean",
  "tmin" = "temp_min",
  "tmax" = "temp_max",
  "horatmin" = "time_temp_min",
  "horatmax" = "time_temp_max",

  # Precipitation
  "prec" = "precipitation",

  # Wind variables - correct names from docs
  "dir" = "wind_direction",
  "velmedia" = "wind_speed",      # NOT wind_speed_mean
  "racha" = "wind_gust",          # NOT wind_gust_max
  "horaracha" = "time_wind_gust", # NOT time_wind_gust_max

  # Atmospheric pressure
  "presMax" = "pressure_max",
  "horaPresMax" = "time_pressure_max",
  "presMin" = "pressure_min",
  "horaPresMin" = "time_pressure_min",

  # Humidity
  "hrMedia" = "humidity_mean",
  "hrMax" = "humidity_max",
  "horaHrMax" = "time_humidity_max",
  "hrMin" = "humidity_min",
  "horaHrMin" = "time_humidity_min",

  # Solar radiation - correct name from docs
  "sol" = "solar_hours", # NOT solar_radiation

  # Fix incorrect previous renames
  "wind_speed_mean" = "wind_speed",
  "wind_gust_max" = "wind_gust",
  "time_wind_gust_max" = "time_wind_gust",
  "solar_radiation" = "solar_hours",

  # Quality control flags
  "temp_range_ok" = "qc_temp_range",
  "temp_realistic" = "qc_temp_realistic",
  "prec_realistic" = "qc_prec_realistic",

  # Metadata
  "collected_at" = "collection_timestamp",
  "processed_at" = "processing_timestamp",
  "source" = "data_source",
  "n_observations" = "observation_count"
)

municipal_variables <- list(
  # Core identifiers - correct names
  "municipio_id" = "municipality_id",
  "municipio_nombre" = "municipality_name",
  "municipio" = "municipality_id",
  "municipio_code" = "municipality_id",
  "provincia" = "province",
  "fecha" = "date",
  "elaborado" = "forecast_issued_at",

  # Temperature variables
  "temp_avg" = "temp_mean",
  "tmed_municipal" = "temp_mean",
  "temp_max" = "temp_max",
  "tmax_municipal" = "temp_max",
  "temp_min" = "temp_min",
  "tmin_municipal" = "temp_min",

  # Humidity - correct names from docs
  "humid_max" = "humidity_max", # NOT hrMax
  "humid_min" = "humidity_min", # NOT hrMin
  "hrMedia_municipal" = "humidity_mean",

  # Wind
  "wind_speed" = "wind_speed",
  "velmedia_municipal" = "wind_speed",

  # Data source and priority
  "data_source" = "data_source",
  "source" = "data_source",
  "priority" = "data_priority",

  # Quality control
  "temp_range_ok" = "qc_temp_range",
  "temp_realistic" = "qc_temp_realistic",

  # Metadata
  "collected_at" = "collection_timestamp",
  "processed_at" = "processing_timestamp",
  "n_stations" = "n_stations"
)

hourly_variables <- list(
  # Core identifiers
  "idema" = "station_id",
  "fint" = "datetime",
  "date" = "date",
  "measure" = "variable_type",
  "value" = "value"
)

# Process each dataset with correct mapping
datasets <- list(
  list(file = "daily_station_historical.csv", vars = daily_station_variables),
  list(file = "daily_municipal_extended.csv", vars = municipal_variables),
  list(file = "hourly_station_ongoing.csv", vars = hourly_variables)
)

data_dir <- "/home/j.palmer/research/weather-data-collector-spain/data/output"

for (dataset in datasets) {
  dataset_file <- dataset$file
  variable_map <- dataset$vars
  file_path <- file.path(data_dir, dataset_file)

  if (!file.exists(file_path)) {
    cat("Warning: File", dataset_file, "not found\n")
    next
  }

  cat("\n=== Processing:", dataset_file, "===\n")

  tryCatch({
    # Read the data
    data <- read.csv(file_path, stringsAsFactors = FALSE)

    cat(" Original dimensions:", nrow(data), "rows,", ncol(data), "columns\n")
    cat(" Original columns:", paste(head(colnames(data), 10), collapse = ", "), "...\n")

    # Create backup
    backup_file <- paste0(file_path, ".backup_corrected_", format(Sys.time(), "%Y%m%d_%H%M%S"))
    file.copy(file_path, backup_file)
    cat(" Backup created:", basename(backup_file), "\n")

    # Apply correct standardization
    data_corrected <- safe_rename_columns(data, variable_map)

    cat(" Final dimensions:", nrow(data_corrected), "rows,", ncol(data_corrected), "columns\n")
    cat(" Final columns:", paste(head(colnames(data_corrected), 10), collapse = ", "), "...\n")

    # Write corrected version
    write.csv(data_corrected, file_path, row.names = FALSE)
    cat(" ✅ Successfully corrected", dataset_file, "\n")

  }, error = function(e) {
    cat(" ❌ Error processing", dataset_file, ":", e$message, "\n")
  })
}

cat("\n🎯 Variable standardization corrected according to documentation!\n")
