Skip to content

Commit ffed638

Browse files
committed
fixed all scripts for getting data
1 parent bf74e2a commit ffed638

File tree

3 files changed

+49
-18
lines changed

3 files changed

+49
-18
lines changed

code/get_forecast_data.R

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@ library(lubridate)
1111
# Load API keys
1212
source("auth/keys.R")
1313

14-
cat("=== AEMET FORECAST DATA COLLECTION (SIMPLE v2) ===\n")
14+
# Set testing mode to TRUE and specify N_TEST_MUNICIPALITIES to get forecast for only selected municipalities
15+
TESTING_MODE = TRUE
16+
N_TEST_MUNICIPALITIES = 2
17+
18+
cat("=== AEMET FORECAST DATA COLLECTION ===\n")
1519
cat("Started at:", format(Sys.time()), "\n")
1620

1721
# Function to get municipality forecast using working pattern
@@ -132,12 +136,13 @@ cat("Loading municipality codes...\n")
132136
municipalities_data = fread("data/input/municipalities.csv.gz")
133137
cat("Loaded", nrow(municipalities_data), "municipalities\n")
134138

135-
# Use small sample for testing
136-
SAMPLE_SIZE = 2
137-
working_municipalities = head(municipalities_data$CUMUN, SAMPLE_SIZE)
138-
names(working_municipalities) = head(municipalities_data$NAMEUNIT, SAMPLE_SIZE)
139+
working_municipalities = municipalities_data$CUMUN
140+
names(working_municipalities) = municipalities_data$NAMEUNIT
139141

140-
cat("Testing with", SAMPLE_SIZE, "municipalities\n\n")
142+
if(TESTING_MODE){
143+
working_municipalities = head(working_municipalities, N_TEST_MUNICIPALITIES)
144+
cat("Testing with", N_TEST_MUNICIPALITIES, "municipalities\n\n")
145+
}
141146

142147
# Collect forecasts
143148
all_forecasts = list()

code/get_historical_data.R

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ library(data.table)
4545
library(curl)
4646
library(jsonlite)
4747

48+
# Set output data file path
49+
output_data_file_path = "data/output/daily_station_historical.csv.gz"
50+
51+
4852
# If you want to prevent concurrent runs of this script, set PREVENT_CONCURRENT_RUNS to TRUE.
4953
PREVENT_CONCURRENT_RUNS = FALSE
5054

@@ -76,15 +80,15 @@ start_date = as_date("2013-07-01")
7680

7781
# Set up curl handle with API key for authentication and increased timeout
7882
h <- new_handle()
79-
handle_setheaders(h, 'api_key' = my_api_key)
83+
handle_setheaders(h, 'api_key' = get_current_api_key())
8084
handle_setopt(h, timeout = 60, connecttimeout = 30) # Increase timeout values
8185

8286
# Generate sequence of all dates to check (from start_date to 4 days before today)
8387
all_dates = seq.Date(from = start_date, to=today()-4, by = "day")
8488

8589
# Load existing historical weather data
86-
if(file.exists("data/output/daily_station_historical.csv.gz")){
87-
stored_weather_daily = fread("data/output/daily_station_historical.csv.gz")
90+
if(file.exists(output_data_file_path)){
91+
stored_weather_daily = fread(output_data_file_path)
8892
} else{stored_weather_daily = NULL}
8993

9094

@@ -118,6 +122,22 @@ lapply(seq(1, length(these_dates), chunksize), function(j){
118122
# Request historical daily climatological data for specific date
119123
req = curl_fetch_memory(paste0('https://opendata.aemet.es/opendata/api/valores/climatologicos/diarios/datos/fechaini/', start_date, 'T00%3A00%3A00UTC/fechafin/', start_date, 'T23%3A59%3A59UTC/todasestaciones'), handle=h)
120124

125+
if(req$status_code == 429) {
126+
cat("Rate limit - rotating key...\n")
127+
rotate_api_key()
128+
handle_setheaders(h, 'api_key' = get_current_api_key())
129+
Sys.sleep(3)
130+
req = curl_fetch_memory(paste0('https://opendata.aemet.es/opendata/api/valores/climatologicos/diarios/datos/fechaini/', start_date, 'T00%3A00%3A00UTC/fechafin/', start_date, 'T23%3A59%3A59UTC/todasestaciones'), handle=h)
131+
132+
}
133+
134+
if(req$status_code != 200) {
135+
cat("API request failed:", req$status_code, "\n")
136+
return(NULL)
137+
}
138+
139+
140+
121141
wurl = fromJSON(rawToChar(req$content))$datos
122142

123143
req = curl_fetch_memory(wurl)
@@ -162,7 +182,9 @@ lapply(seq(1, length(these_dates), chunksize), function(j){
162182
},
163183
error = function(e){
164184
cat("ERROR on date", as.character(start_date), ":", e$message, "\n")
165-
Sys.sleep(60) # Longer sleep on error
185+
rotate_api_key()
186+
handle_setheaders(h, 'api_key' = get_current_api_key())
187+
Sys.sleep(3)
166188
return(NULL)
167189
},
168190
warning = function(w){
@@ -179,8 +201,8 @@ lapply(seq(1, length(these_dates), chunksize), function(j){
179201

180202
print(paste0("Just grabbed ", nrow(weather_daily), " new records"))
181203

182-
if(file.exists("data/output/daily_station_historical.csv.gz")){
183-
stored_weather_daily = fread("data/output/daily_station_historical.csv.gz")
204+
if(file.exists(output_data_file_path)){
205+
stored_weather_daily = fread(output_data_file_path)
184206

185207
print(paste0("We already had ", nrow(stored_weather_daily), " records stored"))
186208

@@ -191,8 +213,8 @@ lapply(seq(1, length(these_dates), chunksize), function(j){
191213

192214
fwrite(weather_daily, "data/output/daily_station_historical.csv.gz")
193215

194-
print("pausing 60 seconds")
195-
Sys.sleep(60) # Increased pause between chunks
216+
# print("pausing 60 seconds")
217+
# Sys.sleep(60) # Increased pause between chunks
196218

197219
})
198220

code/get_latest_data.R

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ library(R.utils)
4646
# Set locale to UTF-8 for proper encoding handling
4747
Sys.setlocale("LC_ALL", "en_US.UTF-8")
4848

49+
# Set output data file path
50+
output_data_file_path = "data/output/hourly_station_ongoing.csv.gz"
51+
4952
# If you want to prevent concurrent runs of this script, set PREVENT_CONCURRENT_RUNS to TRUE.
5053
PREVENT_CONCURRENT_RUNS = FALSE
5154

@@ -143,20 +146,21 @@ if(!is.null(wdia) && nrow(wdia) > 0){
143146
print(paste0("Downloaded ", nrow(latest_weather), " new rows of data with 7 core variables."))
144147

145148
# Load previous weather data
146-
if(file.exists("data/spain_weather_expanded.csv.gz")) {
147-
previous_weather = fread("data/spain_weather_expanded.csv.gz")
149+
if(file.exists(output_data_file_path)) {
150+
previous_weather = fread(output_data_file_path)
151+
print(paste0("Previous dataset file has ", nrow(previous_weather), " rows."))
148152
} else {
149153
previous_weather = data.table()
150154
print("Creating new expanded weather dataset file.")
151155
}
152156

153157
# Combine and deduplicate
154-
spain_weather = bind_rows(latest_weather, previous_weather) %>%
158+
spain_weather = bind_rows(latest_weather, previous_weather) %>% filter(!is.na(value)) %>%
155159
distinct() %>%
156160
arrange(desc(fint))
157161

158162
# Save updated data
159-
fwrite(as.data.table(spain_weather), "data/output/hourly_station_ongoing.csv.gz")
163+
fwrite(spain_weather, output_data_file_path)
160164

161165
print(paste0("Total dataset now contains ", nrow(spain_weather), " rows."))
162166
} else{

0 commit comments

Comments (0)