|
| 1 | + |
| 2 | +R version 4.4.2 (2024-10-31) -- "Pile of Leaves" |
| 3 | +Copyright (C) 2024 The R Foundation for Statistical Computing |
| 4 | +Platform: aarch64-apple-darwin20 |
| 5 | + |
| 6 | +R is free software and comes with ABSOLUTELY NO WARRANTY. |
| 7 | +You are welcome to redistribute it under certain conditions. |
| 8 | +Type 'license()' or 'licence()' for distribution details. |
| 9 | + |
| 10 | + Natural language support but running in an English locale |
| 11 | + |
| 12 | +R is a collaborative project with many contributors. |
| 13 | +Type 'contributors()' for more information and |
| 14 | +'citation()' on how to cite R or R packages in publications. |
| 15 | + |
| 16 | +Type 'demo()' for some demos, 'help()' for on-line help, or |
| 17 | +'help.start()' for an HTML browser interface to help. |
| 18 | +Type 'q()' to quit R. |
| 19 | + |
| 20 | +- Project '~/research/weather-data-collector-spain' loaded. [renv 1.1.4] |
| 21 | +- The project is out-of-sync -- use `renv::status()` for details. |
| 22 | +> #!/usr/bin/env Rscript |
| 23 | +> |
| 24 | +> # aggregate_daily_station_data.R |
| 25 | +> # ------------------------------- |
| 26 | +> # Purpose: Create daily aggregated weather data by station from hourly observations |
| 27 | +> # |
| 28 | +> # This script processes the hourly expanded weather data to create daily summaries |
| 29 | +> # by station. It combines historical daily data with aggregated current observations |
| 30 | +> # to provide a complete time series from 2013 to present. |
| 31 | +> # |
| 32 | +> # Output: Daily means, minimums, maximums, and totals by weather station |
| 33 | +> # |
| 34 | +> # Data Sources: |
| 35 | +> # 1. Historical daily data (2013 to T-4 days) from AEMET climatological endpoint |
| 36 | +> # 2. Current hourly data (T-4 days to present) aggregated to daily values |
| 37 | +> # |
| 38 | +> # Author: John Palmer |
| 39 | +> # Date: 2025-08-20 |
| 40 | +> |
# Dependencies ----
# NOTE(review): the former `rm(list = ls())` was removed. It is a no-op in a
# fresh Rscript batch session and clearing the workspace mid-file is a known
# scripting anti-pattern (it can silently delete caller state when sourced).
library(tidyverse)
library(lubridate)
# data.table is attached last on purpose: its hour/month/week/... helpers are
# allowed to mask the lubridate/dplyr equivalents for the fread/fwrite and
# DT[i, j, by] calls used below.
library(data.table)
cat("=== DAILY STATION DATA AGGREGATION ===\n")

# Check that the expanded hourly data exists; exit with a non-zero status so
# any scheduler running this script can detect the failure.
if (!file.exists("data/output/hourly_station_ongoing.csv.gz")) {
  cat("ERROR: Hourly weather data not found. Run get_latest_data.R first.\n")
  quit(save = "no", status = 1)
}

# Load expanded hourly data and derive a calendar date from the observation
# timestamp (fint) for daily grouping.
cat("Loading hourly weather data...\n")
hourly_data <- fread("data/output/hourly_station_ongoing.csv.gz")
hourly_data$fint <- as_datetime(hourly_data$fint)
hourly_data$date <- as.Date(hourly_data$fint)

cat("Loaded", nrow(hourly_data), "hourly observation records.\n")
# BUG FIX: cat() coerces Date to its underlying day count (the log showed
# "Date range: 20321 to 20321"), so format() the dates before printing.
cat("Date range:", format(min(hourly_data$date, na.rm = TRUE)), "to",
    format(max(hourly_data$date, na.rm = TRUE)), "\n")
| 94 | +> |
| 95 | +> # Load historical daily data if it exists |
| 96 | +> historical_daily = NULL |
| 97 | +> if(file.exists("data/output/daily_station_historical.csv.gz")) { |
| 98 | ++ cat("Loading historical daily data...\n") |
| 99 | ++ historical_daily = fread("data/output/daily_station_historical.csv.gz") |
| 100 | ++ |
| 101 | ++ # Standardize historical data format |
| 102 | ++ if("fecha" %in% names(historical_daily)) { |
| 103 | ++ historical_daily$date = as.Date(historical_daily$fecha) |
| 104 | ++ } |
| 105 | ++ |
| 106 | ++ # Select compatible variables and reshape to match hourly format |
| 107 | ++ historical_compatible = historical_daily %>% |
| 108 | ++ filter(!is.na(date)) %>% |
| 109 | ++ select(any_of(c("date", "idema", "ta", "tamax", "tamin", "hr", "prec", "vv", "p"))) %>% |
| 110 | ++ pivot_longer(cols = c(-date, -idema), names_to = "measure", values_to = "value") %>% |
| 111 | ++ filter(!is.na(value)) %>% |
| 112 | ++ mutate(source = "historical_daily") %>% |
| 113 | ++ as.data.table() |
| 114 | ++ |
| 115 | ++ cat("Loaded", nrow(historical_compatible), "historical daily records.\n") |
| 116 | ++ cat("Historical date range:", min(historical_compatible$date, na.rm=TRUE), "to", max(historical_compatible$date, na.rm=TRUE), "\n") |
| 117 | ++ } else { |
| 118 | ++ cat("No historical daily data found. Using only current observations.\n") |
| 119 | ++ historical_compatible = data.table() |
| 120 | ++ } |
| 121 | +No historical daily data found. Using only current observations. |
| 122 | +> |
# Aggregate hourly data to daily values
cat("Aggregating hourly data to daily summaries...\n")

#' Aggregate long-format hourly observations to one row per (date, idema,
#' measure).
#'
#' BUG FIX: the previous version evaluated case_when(measure %in% ...) inside
#' summarise(). Within a group, `measure` is the full length-n vector, so every
#' case_when branch returned n values and summarise() emitted one row per
#' *hourly* record — the transcript confirms this (62,722 hourly rows in,
#' "Created 62722 daily aggregated records" out, plus the dplyr "more (or
#' less) than 1 row per `summarise()` group" deprecation warning). Reducing on
#' the scalar first(measure) yields a true one-row-per-group aggregation.
#'
#' Aggregation rules: max for tamax, min for tamin, sum for prec (daily
#' accumulation), mean for everything else (ta, hr, vv, pres, ...).
#'
#' @param hourly_dt data.table with columns date, idema, measure, value.
#' @return data.table with columns date, idema, measure, value,
#'   n_observations (hourly rows contributing), source.
aggregate_hourly_to_daily <- function(hourly_dt) {
  daily_aggregated <- hourly_dt %>%
    group_by(date, idema, measure) %>%
    summarise(
      value = {
        m <- first(measure)  # measure is constant within each group
        if (m == "tamax") {
          max(value, na.rm = TRUE)
        } else if (m == "tamin") {
          min(value, na.rm = TRUE)
        } else if (m == "prec") {
          sum(value, na.rm = TRUE)
        } else {
          mean(value, na.rm = TRUE)  # default: daily mean
        }
      },
      n_observations = n(),
      source = "hourly_aggregated",
      .groups = "drop"
    ) %>%
    # max()/min() over an all-NA group return -Inf/Inf; drop those rows along
    # with any NaN means.
    filter(!is.na(value) & !is.infinite(value)) %>%
    as.data.table()

  return(daily_aggregated)
}

daily_from_hourly <- aggregate_hourly_to_daily(hourly_data)
cat("Created", nrow(daily_from_hourly),
    "daily aggregated records from hourly data.\n")
| 158 | +> |
# Combine historical and aggregated current data. When the two sources
# overlap, historical rows win up to the day before the hourly feed begins,
# so no station-day-measure is double counted.
if (nrow(historical_compatible) > 0) {
  hourly_start_date <- min(daily_from_hourly$date, na.rm = TRUE)
  historical_end_date <- max(historical_compatible$date, na.rm = TRUE)

  # BUG FIX: format() the Dates — cat() prints them as numeric day counts.
  cat("Hourly data starts:", format(hourly_start_date), "\n")
  cat("Historical data ends:", format(historical_end_date), "\n")

  if (hourly_start_date <= historical_end_date) {
    # Overlap exists - use historical up to the day before hourly starts
    cutoff_date <- hourly_start_date - days(1)
    historical_to_use <- historical_compatible[date <= cutoff_date]
    cat("Using historical data through", format(cutoff_date), "\n")
  } else {
    # No overlap - use all historical
    historical_to_use <- historical_compatible
    cat("No overlap - using all historical data.\n")
  }

  # Each historical row already represents a single daily observation
  historical_to_use$n_observations <- 1

  # fill = TRUE pads any columns present in only one of the two sources
  combined_daily <- rbind(historical_to_use, daily_from_hourly, fill = TRUE)
} else {
  combined_daily <- daily_from_hourly
}

# Sort the combined dataset for stable, reproducible output
combined_daily <- combined_daily[order(date, idema, measure)]
| 191 | +> |
# Create summary statistics
cat("\n=== DAILY AGGREGATION SUMMARY ===\n")
cat("Total daily records:", nrow(combined_daily), "\n")
# BUG FIX: format() the Dates — the previous run printed
# "Date range: 20321 to 20321" because cat() coerces Date to numeric.
cat("Date range:", format(min(combined_daily$date, na.rm = TRUE)), "to",
    format(max(combined_daily$date, na.rm = TRUE)), "\n")
cat("Number of stations:", length(unique(combined_daily$idema)), "\n")
cat("Variables included:",
    paste(unique(combined_daily$measure), collapse = ", "), "\n")

# Summary by source (data.table prints Date columns correctly on its own)
source_summary <- combined_daily[, .(
  records = .N,
  stations = length(unique(idema)),
  date_min = min(date, na.rm = TRUE),
  date_max = max(date, na.rm = TRUE)
), by = source]

print(source_summary)

# Check data coverage by variable
cat("\n=== VARIABLE COVERAGE ===\n")
variable_coverage <- combined_daily[, .(
  records = .N,
  stations = length(unique(idema)),
  date_min = min(date, na.rm = TRUE),
  date_max = max(date, na.rm = TRUE),
  mean_obs_per_station_day = mean(n_observations, na.rm = TRUE)
), by = measure]

print(variable_coverage)

# Save the aggregated daily data (fwrite gzips based on the .gz extension)
output_file <- "data/output/daily_station_aggregated.csv.gz"
fwrite(combined_daily, output_file)

cat("\n=== AGGREGATION COMPLETE ===\n")
cat("Daily aggregated data saved to:", output_file, "\n")
cat("File size:", round(file.size(output_file) / 1024 / 1024, 1), "MB\n")

proc.time()
0 commit comments