Skip to content

Commit 6ab9b6a

Browse files
committed
Add monitors and AQI daily transformers with pipelines
- Add monitors transformer: reads raw monitors data, selects specific fields, removes duplicates by site_code - Add AQI daily transformer: combines daily pollutant data by year, creates site_code, filters AQI data - Add pipeline scripts for running monitors and AQI daily transformations - Update monitors extractor to remove obsolete load_parameters_csv function - Improve TRV transformers with better unit handling and vectorized operations
1 parent 8b7e724 commit 6ab9b6a

9 files changed

Lines changed: 619 additions & 81 deletions

File tree

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""Pipeline for transforming AQI daily data.
2+
3+
This pipeline reads raw AQI daily summary files, combines all pollutants
4+
for each year, applies transformations, and writes cleaned data to the
5+
transform layer organized by year.
6+
"""
7+
from __future__ import annotations
8+
import sys
9+
from pathlib import Path
10+
from datetime import date
11+
12+
import pandas as pd
13+
14+
# Add src directory to Python path
15+
ROOT = Path(__file__).resolve().parents[2]
16+
sys.path.insert(0, str(ROOT / "src"))
17+
18+
import config
19+
from aqs.transformers.aqi_daily import transform_aqi_daily_for_year
20+
from loaders.filesystem import write_csv
21+
22+
23+
def run():
    """Run the AQI daily transformation pipeline.

    Reads raw AQI daily summaries for every configured year, transforms
    each year's combined pollutant data, and writes one cleaned CSV per
    year to the transform layer.
    """
    print("🚀 Starting AQI Daily Transformation Pipeline")
    print(f"📅 Date: {date.today()}")

    # Make sure the data-lake directory layout exists before any file I/O.
    config.ensure_dirs()

    # Raw daily extracts are the only input; bail out if they are missing.
    raw_daily_dir = config.RAW_DAILY
    if not raw_daily_dir.exists():
        print(f"❌ Raw daily directory not found: {raw_daily_dir}")
        print(" Please run the daily extraction pipeline first.")
        return

    # Cleaned output lands under transform/aqi, one file per year.
    target_dir = config.ROOT / "transform" / "aqi"
    target_dir.mkdir(parents=True, exist_ok=True)

    years_processed = 0
    total_records = 0

    for year in range(config.START_YEAR, config.END_YEAR + 1):
        year_str = str(year)
        print(f"\n📅 Processing year {year_str}...")

        # Combine and clean this year's pollutant files.
        transformed_df = transform_aqi_daily_for_year(year_str, raw_daily_dir)
        if transformed_df.empty:
            print(f"⚠️ No data for year {year_str}, skipping")
            continue

        destination = target_dir / f"aqi_aqs_daily_{year_str}.csv"
        write_csv(transformed_df, destination)
        print(f"✅ Wrote {len(transformed_df)} AQI records to {destination}")

        years_processed += 1
        total_records += len(transformed_df)

    print("\n🎉 AQI daily transformation complete!")
    print(f"📊 Processed {years_processed} years with {total_records} total records")


if __name__ == "__main__":
    run()

pipelines/aqs/monitors_run.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""Pipeline for extracting all Oregon monitors metadata from AQS API.
2+
3+
This pipeline fetches monitor metadata for all relevant parameters in Oregon
4+
from 2005 through 2025, deduplicates by site and parameter, and writes the results to the data lake.
5+
"""
6+
from __future__ import annotations
7+
import sys
8+
from pathlib import Path
9+
from datetime import date
10+
11+
# Add src directory to Python path
12+
ROOT = Path(__file__).resolve().parents[2]
13+
sys.path.insert(0, str(ROOT / "src"))
14+
15+
import config
16+
from aqs.extractors.monitors import fetch_all_monitors_for_oregon
17+
from loaders.filesystem import write_csv
18+
19+
20+
def run():
    """Run the Oregon monitors extraction pipeline.

    Fetches monitor metadata from the AQS API for the 2005-2025 window
    and writes the deduplicated records to the raw layer of the data lake.
    """
    print("🚀 Starting Oregon Monitors Extraction Pipeline")
    print(f"📅 Date: {date.today()}")

    # Ensure directories exist and AQS API credentials are configured.
    config.ensure_dirs()
    config.set_aqs_credentials()

    # Fixed extraction window covering the full study period.
    bdate = date(2005, 1, 1)
    edate = date(2025, 12, 31)
    print(f"📅 Processing monitors from {bdate} to {edate}")

    print("\n📡 Fetching monitor metadata from AQS API...")
    monitors_df = fetch_all_monitors_for_oregon(bdate, edate)
    if monitors_df.empty:
        # Nothing came back from the API; leave the data lake untouched.
        print("❌ No monitors found")
        return

    # Land the raw extract under raw/aqs/monitors.
    destination_dir = config.ROOT / "raw" / "aqs" / "monitors"
    destination_dir.mkdir(parents=True, exist_ok=True)
    destination = destination_dir / "oregon_monitors_2005_2025.csv"
    write_csv(monitors_df, destination)

    print(f"\n✅ Wrote {len(monitors_df)} unique monitor records to {destination}")
    print("\n🎉 Monitors extraction complete!")


if __name__ == "__main__":
    run()
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""Pipeline for transforming Oregon monitors metadata.
2+
3+
This pipeline reads raw monitor metadata, selects specific fields,
4+
removes duplicates by site_code, and writes cleaned monitor records
5+
to the transform layer.
6+
"""
7+
from __future__ import annotations
8+
import sys
9+
from pathlib import Path
10+
from datetime import date
11+
12+
import pandas as pd
13+
14+
# Add src directory to Python path
15+
ROOT = Path(__file__).resolve().parents[2]
16+
sys.path.insert(0, str(ROOT / "src"))
17+
18+
import config
19+
from aqs.transformers.monitors import transform_monitors
20+
from loaders.filesystem import write_csv
21+
22+
23+
def run():
    """Run the Oregon monitors transformation pipeline.

    Loads the raw monitors extract, applies the monitors transformer,
    and writes the cleaned records to the transform layer.
    """
    print("🚀 Starting Oregon Monitors Transformation Pipeline")
    print(f"📅 Date: {date.today()}")

    # Make sure the data-lake directory layout exists.
    config.ensure_dirs()

    # Raw extract produced by the monitors extraction pipeline.
    input_path = config.ROOT / "raw" / "aqs" / "monitors" / "oregon_monitors_2005_2025.csv"
    if not input_path.exists():
        print(f"❌ Raw monitors file not found: {input_path}")
        print(" Please run the monitors extraction pipeline first.")
        return

    print(f"\n📖 Reading raw monitors from {input_path}")
    raw_monitors_df = pd.read_csv(input_path)
    if raw_monitors_df.empty:
        print("❌ No raw monitors data found")
        return
    print(f"📊 Loaded {len(raw_monitors_df)} raw monitor records")

    # Select fields and drop duplicate sites.
    print("\n🔄 Transforming monitor data...")
    transformed_df = transform_monitors(raw_monitors_df)
    if transformed_df.empty:
        print("❌ Transformation resulted in empty dataset")
        return

    # Write the cleaned monitors to transform/monitors.
    destination_dir = config.ROOT / "transform" / "monitors"
    destination_dir.mkdir(parents=True, exist_ok=True)
    destination = destination_dir / "aqs_monitors.csv"
    write_csv(transformed_df, destination)

    print(f"\n✅ Wrote {len(transformed_df)} transformed monitor records to {destination}")
    print("\n🎉 Monitors transformation complete!")


if __name__ == "__main__":
    run()

src/aqs/extractors/monitors.py

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,15 @@
1818

1919
import json
2020
from urllib.parse import urlencode
21+
from concurrent.futures import ThreadPoolExecutor
22+
import threading
2123

2224
import pandas as pd
2325
import requests
2426

2527
from pyaqsapi import bystate
2628
import config
29+
from aqs import _client
2730

2831

2932
def _ensure_dataframe(payload: object) -> Optional[pd.DataFrame]:
@@ -139,7 +142,7 @@ def fetch_samples_for_parameter(parameter_code: str, bdate: date, edate: date, s
139142

140143
samples_frames: List[pd.DataFrame] = []
141144
# Expect monitors to contain state_code, county_code, site_number columns
142-
unique_sites = monitors[["state_code", "county_code", "site_number"]].drop_duplicates()
145+
unique_sites = monitors[["state_code", "county_code", "site_number", "parameter_code"]].drop_duplicates()
143146
for _, row in unique_sites.iterrows():
144147
state = str(row["state_code"]).zfill(2)
145148
county = str(row["county_code"]).zfill(3)
@@ -164,12 +167,62 @@ def fetch_samples_for_parameter(parameter_code: str, bdate: date, edate: date, s
164167
return pd.concat(samples_frames, ignore_index=True)
165168

166169

167-
def load_parameters_csv(path: str = "ops/parameters.csv") -> List[str]:
168-
"""Load the `AQS_Parameter` column from the parameters CSV as strings.
169170

170-
Returns a list of parameter codes suitable for looping in the pipeline.
171+
172+
def fetch_all_monitors_for_oregon(bdate: date, edate: date) -> pd.DataFrame:
    """Fetch all monitor metadata for Oregon (state FIPS 41) over a date range.

    Queries the AQS API once per hardcoded parameter code, concatenates the
    results, builds a ``site_code`` column, and deduplicates so there is one
    row per unique site/parameter combination.

    Args:
        bdate: Start of the date range to query.
        edate: End of the date range to query.

    Returns:
        DataFrame of unique monitor-parameter rows with an added
        ``site_code`` column, or an empty DataFrame if nothing was found.

    Raises:
        RuntimeError: If the AQS circuit breaker is open.
    """
    # Fail fast if earlier API failures tripped the circuit breaker.
    if _client.circuit_is_open():
        raise RuntimeError("AQS circuit is open; cannot fetch monitors")

    # Hardcoded parameter codes of interest for this project.
    parameter_codes = [
        "44201", "88101", "88502", "85103", "85110", "85128", "17141",
        "43817", "43804", "45201", "43509", "43503", "14115", "17242",
    ]

    print(f"📋 Processing {len(parameter_codes)} parameters for monitors...")

    def fetch_for_param(code: str) -> pd.DataFrame:
        """Fetch monitors for a single parameter code (Oregon only)."""
        print(f" 📡 Fetching monitors for parameter {code}...")
        monitors = fetch_monitors([code], bdate, edate, "41")  # Oregon FIPS
        if not monitors.empty:
            print(f" ✅ Found {len(monitors)} monitors for {code}")
        else:
            print(f" ⚠️ No monitors found for {code}")
        return monitors

    # Fetch each parameter concurrently; cap workers to avoid API overload.
    with ThreadPoolExecutor(max_workers=4) as executor:
        all_monitors = [
            df for df in executor.map(fetch_for_param, parameter_codes)
            if not df.empty
        ]

    if not all_monitors:
        print("❌ No monitors found for any parameter")
        return pd.DataFrame()

    print("🔄 Concatenating and deduplicating monitor data...")
    combined = pd.concat(all_monitors, ignore_index=True)
    original_count = len(combined)

    # site_code uniquely identifies a site: state (2) + county (3) + site (4).
    combined["site_code"] = (
        combined["state_code"].astype(str).str.zfill(2)
        + combined["county_code"].astype(str).str.zfill(3)
        + combined["site_number"].astype(str).str.zfill(4)
    )

    # The dedup grain is site + parameter (NOT site alone, which an earlier
    # docstring claimed): one row per unique monitor-parameter combination.
    combined = combined.drop_duplicates(
        subset=["state_code", "county_code", "site_number", "parameter_code"]
    ).reset_index(drop=True)
    deduped_count = len(combined)

    print(f"✅ Deduplicated: {original_count} raw entries → {deduped_count} unique monitor-parameter combinations")
    return combined

0 commit comments

Comments
 (0)