@@ -58,49 +58,49 @@ cat("Loaded", nrow(municipal_forecasts), "municipal forecast records.\n")
# Report the date span and the number of municipalities in the loaded
# forecasts.
# cat() ignores the class of its arguments, so a Date/IDate column would be
# printed as a raw day count. format() turns the value into text first and
# leaves a character column unchanged.
cat(" Forecast date range:",
    format(min(municipal_forecasts$fecha, na.rm = TRUE)),
    " to",
    format(max(municipal_forecasts$fecha, na.rm = TRUE)),
    " \n ")
cat(" Number of municipalities:",
    length(unique(municipal_forecasts$municipio_id)),
    " \n ")
6060
# Load the station-municipality mapping table, attach a municipality to every
# station observation, and average the observations per municipality, day and
# measure.
cat(" Loading station-municipality mapping...\n ")

# NOTE(review): the file name is spelled "municipaities"; confirm this matches
# the file on disk before correcting the spelling.
mapping_path <- "data/input/station_point_municipaities_table.csv"

if (!file.exists(mapping_path)) {
  # stop() ends a non-interactive Rscript run with a non-zero exit status but,
  # unlike quit(), does not terminate an interactive session.
  stop("Station-municipality mapping file not found: ", mapping_path,
       call. = FALSE)
}

station_municipality_map <- fread(mapping_path)
cat(" Loaded mapping for", nrow(station_municipality_map), " stations to municipalities.\n ")
cat(" Number of municipalities:", length(unique(station_municipality_map$NATCODE)), " \n ")

# Create proper municipality-station aggregation
cat(" Aggregating station data by municipality...\n ")
cat(" Joining station data with municipality mapping...\n ")

# Keep only the join key and the municipality identifiers, renamed to the
# column names used by the rest of the script.
station_lookup <- station_municipality_map[, .(
  idema = INDICATIVO,
  municipio_id = NATCODE,
  municipio_nombre = NAMEUNIT
)]

# An exact duplicate row in the mapping would duplicate every observation of
# that station in the join and give it extra weight in the municipal mean.
station_lookup <- unique(station_lookup)

# Left join: station rows without a mapping are kept (municipio_id is NA) so
# they can be counted below; they are excluded from the aggregation.
station_daily_with_municipality <- merge(
  station_daily,
  station_lookup,
  by = "idema",
  all.x = TRUE
)

is_mapped <- !is.na(station_daily_with_municipality$municipio_id)
cat(" Stations with municipality mapping:",
    length(unique(station_daily_with_municipality$idema[is_mapped])), " \n ")
cat(" Stations without mapping:",
    length(unique(station_daily_with_municipality$idema[!is_mapped])), " \n ")

# Create municipal aggregates
cat(" Creating municipal aggregates from station data...\n ")

# One row per date, municipality and measure: the mean across contributing
# stations, plus the number of distinct stations in the group.
municipal_daily <- station_daily_with_municipality[!is.na(municipio_id), .(
  value = mean(value, na.rm = TRUE),
  n_stations = length(unique(idema)),
  source = "station_aggregate"
), by = .(date, municipio_id, municipio_nombre, measure)]
10099
# Report the size and coverage of the municipal aggregates.
cat(" Created municipal daily aggregates:\n ")
cat(" Records:", nrow(municipal_daily), " \n ")
cat(" Municipalities:", length(unique(municipal_daily$municipio_id)), " \n ")
# cat() ignores the class of its arguments, so dates are formatted as text
# first. na.rm = TRUE matches the other date-range reports in this script and
# keeps a single missing date from turning the whole range into NA.
cat(" Date range:",
    format(min(municipal_daily$date, na.rm = TRUE)),
    " to",
    format(max(municipal_daily$date, na.rm = TRUE)),
    " \n ")
104104
105105# Convert forecast data to compatible format
106106cat(" Processing municipal forecast data...\n " )
@@ -139,27 +139,35 @@ cat("Reshaped forecast data:\n")
# Report how many reshaped forecast rows there are and which measures they
# contain.
reshaped_row_count <- nrow(forecast_reshaped)
reshaped_measures <- paste(unique(forecast_reshaped$measure), collapse = " , ")
cat(" Records:", reshaped_row_count, " \n ")
cat(" Variables:", reshaped_measures, " \n ")
141141
# Match forecast data with municipal aggregates
cat(" Combining municipal station data with forecasts...\n ")

# Keep only forecasts for municipalities that also have station data.
# NOTE(review): %in% compares the two id columns after coercion, so an integer
# id on one side and a zero-padded character id on the other would not match.
# Confirm both columns are stored the same way.
available_municipalities <- unique(municipal_daily$municipio_id)
forecast_filtered <- forecast_reshaped[municipio_id %in% available_municipalities]

# forecast_filtered only holds ids that occur in municipal_daily, so its
# distinct ids are exactly the municipalities present in both sources.
shared_municipalities <- unique(forecast_filtered$municipio_id)
cat(" Municipalities with both station data and forecasts:",
    length(shared_municipalities), " \n ")

# Without any overlap the combined dataset would silently contain station
# rows only, so make that situation visible.
if (length(shared_municipalities) == 0) {
  warning("No forecast municipality matches a municipality with station data; ",
          "check that municipio_id uses the same coding in both sources.",
          call. = FALSE)
}
151151
# Find the overlap/gap between station data and forecasts by municipality.

# First and last observed date per municipality in the station aggregates.
overlap_summary <- municipal_daily[, .(
  station_end_date = max(date, na.rm = TRUE),
  station_start_date = min(date, na.rm = TRUE)
), by = municipio_id]

# First and last forecast date per municipality.
forecast_summary <- forecast_filtered[, .(
  forecast_start_date = min(date, na.rm = TRUE),
  forecast_end_date = max(date, na.rm = TRUE)
), by = municipio_id]

# Full outer join: a municipality missing from one side keeps its row, with NA
# in the other side's date columns.
coverage_summary <- merge(overlap_summary, forecast_summary, by = "municipio_id", all = TRUE)
cat(" Coverage summary for municipalities:\n ")
# head() returns at most 10 rows. Indexing with 1:10 would pad the printout
# with all-NA rows whenever fewer than 10 municipalities are present.
print(head(coverage_summary, 10))
166+
# Stack the station-derived rows and the forecast rows into one long table,
# restricted to the columns the two sources have in common.
combined_columns <- c("date", "municipio_id", "municipio_nombre",
                      "measure", "value", "source")
combined_municipal <- rbindlist(
  list(
    municipal_daily[, combined_columns, with = FALSE],
    forecast_filtered[, combined_columns, with = FALSE]
  ),
  use.names = TRUE,
  fill = TRUE
)
165173
@@ -169,31 +177,53 @@ combined_municipal = combined_municipal[order(date, measure)]
# Create summary: overall size, municipality count, date span and measures of
# the combined dataset.
cat(" \n === MUNICIPAL AGGREGATION SUMMARY ===\n ")
cat(" Total municipal records:", nrow(combined_municipal), " \n ")
cat(" Number of municipalities:", length(unique(combined_municipal$municipio_id)), " \n ")
# cat() ignores the class of its arguments, so dates are formatted as text
# first. Otherwise a Date/IDate column would be printed as a raw day count.
cat(" Date range:",
    format(min(combined_municipal$date, na.rm = TRUE)),
    " to",
    format(max(combined_municipal$date, na.rm = TRUE)),
    " \n ")
cat(" Variables included:", paste(unique(combined_municipal$measure), collapse = " , "), " \n ")
174183
# Per-source overview: row count, number of distinct municipalities and the
# first/last date contributed by each source.
source_summary <- combined_municipal[
  ,
  .(
    records = .N,
    municipalities = uniqueN(municipio_id),
    date_min = min(date, na.rm = TRUE),
    date_max = max(date, na.rm = TRUE)
  ),
  by = "source"
]

cat(" \n By source:\n ")
print(source_summary)
183194
# Summary by municipality, largest record counts first: number of rows,
# number of distinct measures and the first/last date per municipality.
municipality_summary <- combined_municipal[, .(
  records = .N,
  variables = length(unique(measure)),
  date_min = min(date, na.rm = TRUE),
  date_max = max(date, na.rm = TRUE)
), by = .(municipio_id, municipio_nombre)][order(-records)]

cat(" \n Top 10 municipalities by record count:\n ")
# head() returns at most 10 rows. Indexing with 1:10 would append all-NA rows
# whenever fewer than 10 municipalities are present.
print(head(municipality_summary, 10))
205+
# Per-measure overview: row count, number of distinct municipalities and the
# first/last date on which each measure appears.
variable_summary <- combined_municipal[
  ,
  .(
    records = .N,
    municipalities = uniqueN(municipio_id),
    date_min = min(date, na.rm = TRUE),
    date_max = max(date, na.rm = TRUE)
  ),
  by = "measure"
]

cat(" \n By variable:\n ")
print(variable_summary)
192216
# Save the combined municipal data
output_file <- "data/output/municipal_combined.csv.gz"

# Make sure the output directory exists before writing. Nothing happens if it
# is already there.
dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)
fwrite(combined_municipal, output_file)

# Single completion report. The banner, path and file size used to be printed
# twice in a row.
cat(" \n === MUNICIPAL AGGREGATION COMPLETE ===\n ")
cat(" Municipal combined data saved to:", output_file, " \n ")
cat(" File size:", round(file.size(output_file) / 1024 / 1024, 1), " MB\n ")
cat(" Total municipalities:", length(unique(combined_municipal$municipio_id)), " \n ")
# cat() ignores the class of its arguments, so dates are formatted as text
# first.
cat(" Date coverage:",
    format(min(combined_municipal$date, na.rm = TRUE)),
    " to",
    format(max(combined_municipal$date, na.rm = TRUE)),
    " \n ")
0 commit comments