
Commit dae3ca0

reduce logging noise (#141)

* reduce logging noise
* update docs on logging related details

1 parent cec5f04 commit dae3ca0

5 files changed (+26, -40 lines)


docs/extraction_architecture.md

Lines changed: 20 additions & 9 deletions
@@ -192,34 +192,44 @@ flowchart TD
 The pipeline implements **fault-tolerant design** common in production data systems:
 
 1. **Provider-level fault isolation** - Individual provider failures don't cascade; pipeline continues processing other providers
-2. **Extraction error boundaries** - Caught by `@extractor_error_handler` decorator; malformed data is logged but doesn't block the pipeline
-3. **Network resilience** - httpx timeout management (30 second timeout) prevents hanging; timeouts are logged as provider failures
-4. **API error handling** - Google Sheets API errors are logged with full context; transient failures can be retried by re-running the pipeline
+2. **Extraction error boundaries** - Caught by `@extractor_error_handler` decorator; malformed data is silently skipped without blocking the pipeline
+3. **Network resilience** - httpx timeout management (30 second timeout) prevents hanging; HTTP errors silently return None for graceful degradation
+4. **API error handling** - Google Sheets API errors are logged with full context at the application level; transient failures can be retried by re-running the pipeline
+5. **Graceful degradation** - Failed articles are silently skipped (exception caught), allowing the pipeline to process successfully extracted articles
 
 **Idempotent Operations**:
 
 - Deduplication check ensures reruns don't insert duplicates
 - Timestamp updates are overwritten (safe for retries)
 - Sheet sorting is deterministic
 
-All errors are written to stdout for operational visibility (captured in GitHub Actions logs or Docker containers).
+**Logging Strategy**:
+
+- High-level events (provider processing, batch writes) logged in `main.py`
+- Low-level errors (HTTP failures, extraction errors) handled silently in utility modules
+- This reduces log noise while maintaining operational visibility at the application level
 
 ## Logging & Observability
 
 Structured logging enables operational visibility:
 
 - **Level**: INFO (production-grade)
 - **Format**: `%(asctime)s - %(name)s - %(levelname)s - %(message)s`
+- **Date Format**: `%Y-%m-%d %H:%M:%S` (without milliseconds)
 - **Output**: stdout (captured by GitHub Actions logs and Docker)
+- **httpx Logging**: Suppressed to CRITICAL level to reduce noise from HTTP requests
+- **Centralized Setup**: All logging configured in `main.py` for consistency
 
 **Key Log Messages** (Observable Events):
 
-- "Processed {provider}: X new articles found" - Success metric
-- "Failed to fetch page for {provider}" - Network issue indicator
-- "Error processing {provider}: {error}" - Provider-specific failures
-- "Unknown provider: {provider}" - Configuration issue
+- "Processing {provider_url} - X new articles found" - Success metric per provider
+- "Failed to fetch page for {provider_name} from {provider_url}" - Network issue indicator
+- "Error processing {provider_name}: {error}" - Provider-specific failures
+- "Unknown provider: {provider_name}" - Configuration issue
+- "Batch write complete: X articles added to the sheet." - Load completion metric
+- "✅ No new articles found" - No-op scenario indicator
 
-These logs enable downstream monitoring, alerting, and audit trails—essential for operational pipelines.
+These logs enable downstream monitoring, alerting, and audit trails—essential for operational pipelines. Utility modules (`get_page.py`, `extractors.py`) delegate logging to `main.py` for a unified view.
 
 ## Performance & Architecture
 
@@ -228,6 +238,7 @@ These logs enable downstream monitoring, alerting, and audit trails—essential
 - **Sequential processing** - Providers processed one at a time; can be parallelized if needed
 - **Generator-based streaming** - Articles flow through pipeline immediately after extraction (no batch buffering)
 - **Memory efficient** - Generators enable incremental processing without storing all articles in memory
+- **Centralized logging** - Single logging source in `main.py` provides unified observability across all pipeline stages
 
 ### Rate Limiting & Respect
 
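The "Centralized Setup" and "delegate logging to `main.py`" points rest on the standard library's logger hierarchy: a module that only calls `logging.getLogger(__name__)` propagates its records to whatever root handler the entry point configured. A minimal, self-contained sketch of that pattern (the module name and message are illustrative, not taken from the repo):

```python
import logging
import sys

# Root configuration, done once at the application entry point
# (the same shape as the basicConfig call in script/main.py below).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    stream=sys.stdout,
)

# A utility module needs only a named logger; it inherits the root
# handler and format, so no per-module basicConfig call is required.
util_logger = logging.getLogger("utils.example")  # hypothetical module name
util_logger.info("this record propagates to the stdout handler configured above")
```
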
script/main.py

Lines changed: 4 additions & 4 deletions
@@ -29,8 +29,10 @@
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
     stream=sys.stdout,
 )
+logging.getLogger("httpx").setLevel(logging.CRITICAL)
 
 
 async def process_provider(fetcher_state, provider, existing_titles):
@@ -64,7 +66,7 @@ async def process_provider(fetcher_state, provider, existing_titles):
         get_articles(elements, handler["extractor"], existing_titles)
     )
     logger.info(
-        f"Processed {provider_name}: {len(articles_found)} new articles found"
+        f"Processing {provider_url} - {len(articles_found)} new articles found"
     )
     return articles_found, fetcher_state
 
@@ -92,11 +94,9 @@ async def async_main(timestamp):
 
     # Batch write all articles at once
     if all_articles:
-        batch_start = time.time()
         batch_append_articles(articles_sheet, all_articles)
-        batch_time = time.time() - batch_start
         logger.info(
-            f"Batch write complete: {len(all_articles)} articles written in {batch_time:.2f}s"
+            f"Batch write complete: {len(all_articles)} articles added to the sheet."
         )
     else:
         logger.info("\n✅ No new articles found\n")

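The `logging.getLogger("httpx").setLevel(logging.CRITICAL)` line works because httpx emits one INFO-level record per request through its own `httpx` logger; raising that logger's threshold hides those lines while the application's own INFO messages still appear. A standalone sketch of the effect (the URL is a placeholder and the snippet needs network access to run):

```python
import asyncio
import logging
import sys

import httpx

logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# Without the next line, every request adds an INFO record such as:
#   HTTP Request: GET https://example.com "HTTP/1.1 200 OK"
logging.getLogger("httpx").setLevel(logging.CRITICAL)

logger = logging.getLogger("main")


async def main():
    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.get("https://example.com")  # placeholder URL
    logger.info("fetched %s -> HTTP %s", response.url, response.status_code)


asyncio.run(main())
```
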
script/utils/extractors.py

Lines changed: 1 addition & 8 deletions
@@ -1,18 +1,11 @@
 import re
 import logging
-import sys
 import traceback
 from datetime import datetime
 from utils.format_date import clean_and_convert_date
 
 
 logger = logging.getLogger(__name__)
-# Configure logging to write to stdout for log file capture
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    stream=sys.stdout,
-)
 
 
 # Error handling decorator for extractors
@@ -152,7 +145,7 @@ def get_articles(elements, extract_func, existing_titles):
             if normalized_title not in normalized_existing_titles:
                 yield article_info
         except Exception as e:
-            logger.error(f"Skipping an article due to error: {e}")
+            pass
 
 
 def provider_dict(provider_element):

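The `pass` in `get_articles` means one malformed article no longer stops (or log-spams) the generator; the per-provider totals logged in `main.py` remain the only signal. A minimal sketch of that skip-and-continue pattern, using a toy extractor and the `(date, title, link)` tuple shape visible elsewhere in this diff; it is not the repo's function:

```python
def get_articles_sketch(elements, extract_func, existing_titles):
    """Yield extracted articles, silently skipping elements that fail to extract."""
    normalized_existing = {title.strip().lower() for title in existing_titles}
    for element in elements:
        try:
            article_info = extract_func(element)  # may raise on malformed markup
            normalized_title = article_info[1].strip().lower()
            if normalized_title not in normalized_existing:
                yield article_info
        except Exception:
            # Low-level failure: swallow it here; main.py reports only
            # the per-provider article count, keeping the logs quiet.
            continue


# One bad element does not break the stream: the second item below raises
# AttributeError inside the toy extractor and is simply skipped.
articles = list(
    get_articles_sketch(
        ["first story", None],
        lambda el: ("2024-01-01", el.title(), "https://example.com"),  # toy extractor
        existing_titles=[],
    )
)
print(articles)  # [('2024-01-01', 'First Story', 'https://example.com')]
```
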
script/utils/get_page.py

Lines changed: 0 additions & 9 deletions
@@ -1,18 +1,11 @@
 import httpx
 import asyncio
 import logging
-import sys
 import time
 from bs4 import BeautifulSoup
 from .constants import DEFAULT_REQUEST_INTERVAL, DEFAULT_TIMEOUT
 
 logger = logging.getLogger(__name__)
-# Configure logging to write to stdout for log file capture
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    stream=sys.stdout,
-)
 
 
 def init_fetcher_state():
@@ -52,11 +45,9 @@ async def fetch_page(state, url):
             soup = BeautifulSoup(response.text, "html.parser")
             return soup, state
 
-        logger.error(f"HTTP {response.status_code} from {url}")
         return None, state
 
     except Exception as e:
-        logger.error(f"Error fetching {url}: {str(e)}")
         return None, state
 
 
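With both `logger.error` calls removed, `fetch_page`'s contract is simply "soup on success, `None` on any failure", and the caller decides what deserves a log line. A sketch of that contract under simplifying assumptions (simplified signature, not the repo's exact implementation; the URL and provider name are placeholders):

```python
import asyncio
import logging

import httpx
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")


async def fetch_page_sketch(url: str, timeout: float = 30.0):
    """Return parsed HTML on success, or None on any HTTP or network failure."""
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url)
        if response.status_code == 200:
            return BeautifulSoup(response.text, "html.parser")
        return None  # non-200 response: degrade silently, no log line here
    except Exception:
        return None  # timeouts, DNS errors, etc. are also silent


async def main():
    soup = await fetch_page_sketch("https://example.com")  # placeholder URL
    if soup is None:
        # The caller (main.py in this repo) owns the observable message.
        logger.info("Failed to fetch page for example-provider from https://example.com")
    else:
        logger.info("Fetched page with %d links", len(soup.find_all("a")))


asyncio.run(main())
```
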
script/utils/sheet.py

Lines changed: 1 addition & 10 deletions
@@ -76,9 +76,7 @@ def get_all_providers(providers_sheet: Worksheet) -> List[Dict[str, Any]]:
     return providers_sheet.get_all_records()
 
 
-def batch_append_articles(
-    sheet: Worksheet, articles: List[tuple], log_func: Callable = print
-) -> None:
+def batch_append_articles(sheet: Worksheet, articles: List[tuple]) -> None:
     """
     Appends multiple article rows to the given sheet in a single batch operation.
 
@@ -90,13 +88,6 @@ def batch_append_articles(
     if not articles:
         return
 
-    # Log all articles
-    for article_info in articles:
-        date = article_info[0]
-        title = article_info[1]
-        link = article_info[2]
-        log_func(f"==> {title} - {date}\n{link}\n")
-
     # Batch append all rows at once
     rows = [list(article) for article in articles]
     sheet.append_rows(rows)

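With the per-article `log_func` loop removed, the load step's only observable event is the single summary line emitted by the caller. A sketch of that caller/utility split, with the gspread `Worksheet` stubbed out so the example runs without credentials (the stub and the sample row are illustrative):

```python
import logging
from typing import List

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("main")


class FakeWorksheet:
    """Stand-in for gspread.Worksheet so the sketch runs offline."""

    def append_rows(self, rows: List[list]) -> None:
        self.last_rows = rows


def batch_append_articles(sheet, articles: List[tuple]) -> None:
    # Quiet utility: no per-article logging, just the batch write.
    if not articles:
        return
    sheet.append_rows([list(article) for article in articles])


# The caller (main.py-style) owns the single observable log message.
all_articles = [("2024-01-01", "Example title", "https://example.com")]  # illustrative row
batch_append_articles(FakeWorksheet(), all_articles)
logger.info(f"Batch write complete: {len(all_articles)} articles added to the sheet.")
```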