Commit e98f52a

Address code review feedback: improve error handling and documentation
- Implement fail-fast error handling for configuration errors
- Distinguish between critical config errors (fail) and network issues (use defaults)
- Add detailed error logging with stack traces for debugging
- Document new crawler settings in .env.example
- Add inline comments explaining safe defaults

Critical configuration errors (ValueError, KeyError, TypeError) now fail fast as per alpha principles, while transient errors still fall back to safe defaults with prominent error logging.
1 parent aab0721 commit e98f52a

3 files changed (+22 -7 lines changed)

.env.example

Lines changed: 6 additions & 1 deletion
@@ -33,4 +33,9 @@ EMBEDDING_DIMENSIONS=1536
 # - OPENAI_API_KEY (encrypted)
 # - MODEL_CHOICE
 # - TRANSPORT settings
-# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
+# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
+# - Crawler settings:
+#   * CRAWL_MAX_CONCURRENT (default: 10) - Max concurrent pages per crawl operation
+#   * CRAWL_BATCH_SIZE (default: 50) - URLs processed per batch
+#   * MEMORY_THRESHOLD_PERCENT (default: 80) - Memory % before throttling
+#   * DISPATCHER_CHECK_INTERVAL (default: 0.5) - Memory check interval in seconds
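For reference, these keys could be set in a local .env like so; the values shown are simply the documented defaults (tune them to your hardware):

# Crawler tuning - values shown are the documented defaults
CRAWL_MAX_CONCURRENT=10
CRAWL_BATCH_SIZE=50
MEMORY_THRESHOLD_PERCENT=80
DISPATCHER_CHECK_INTERVAL=0.5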

python/src/server/services/crawling/strategies/batch.py

Lines changed: 8 additions & 3 deletions
@@ -59,19 +59,24 @@ async def crawl_batch_with_progress(
         await progress_callback("error", 0, "Crawler not available")
         return []
 
-    # Load settings from database first
+    # Load settings from database - fail fast on configuration errors
     try:
         settings = await credential_service.get_credentials_by_category("rag_strategy")
         batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
         if max_concurrent is None:
             max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
         memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
         check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
+    except (ValueError, KeyError, TypeError) as e:
+        # Critical configuration errors should fail fast in alpha
+        logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
+        raise ValueError(f"Failed to load crawler configuration: {e}")
     except Exception as e:
-        logger.warning(f"Failed to load crawl settings: {e}, using defaults")
+        # For non-critical errors (e.g., network issues), use defaults but log prominently
+        logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
         batch_size = 50
         if max_concurrent is None:
-            max_concurrent = 10
+            max_concurrent = 10  # Safe default to prevent memory issues
         memory_threshold = 80.0
         check_interval = 0.5
         settings = {}  # Empty dict for defaults
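One subtlety in the hunk above: Python matches except clauses top to bottom, so the narrow except (ValueError, KeyError, TypeError) must come before the broad except Exception or it would never fire. Note also that the bare raise ValueError(...) inside an except block still carries the original exception via implicit chaining (__context__), so the root cause stays visible in tracebacks. A caller can then treat the re-raised ValueError as a hard stop; a hypothetical sketch (argument names are illustrative, not from the repo):

try:
    results = await crawl_batch_with_progress(urls, progress_callback=report)
except ValueError as e:
    # Configuration is broken - surface it rather than crawl with bad settings.
    await report("error", 0, f"Crawler misconfigured: {e}")
    raise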

python/src/server/services/crawling/strategies/recursive.py

Lines changed: 8 additions & 3 deletions
@@ -61,19 +61,24 @@ async def crawl_recursive_with_progress(
         await progress_callback('error', 0, 'Crawler not available')
         return []
 
-    # Load settings from database
+    # Load settings from database - fail fast on configuration errors
     try:
         settings = await credential_service.get_credentials_by_category("rag_strategy")
         batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
         if max_concurrent is None:
             max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
         memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
         check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
+    except (ValueError, KeyError, TypeError) as e:
+        # Critical configuration errors should fail fast in alpha
+        logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
+        raise ValueError(f"Failed to load crawler configuration: {e}")
     except Exception as e:
-        logger.warning(f"Failed to load crawl settings: {e}, using defaults")
+        # For non-critical errors (e.g., network issues), use defaults but log prominently
+        logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
         batch_size = 50
         if max_concurrent is None:
-            max_concurrent = 10
+            max_concurrent = 10  # Safe default to prevent memory issues
         memory_threshold = 80.0
         check_interval = 0.5
         settings = {}  # Empty dict for defaults
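Both strategies now share identical loading logic. For a feel of which inputs hit the fail-fast branch, the int()/float() conversions raise exactly the caught types; a standalone illustration with made-up values, runnable on its own:

# Standalone illustration of the error types the fail-fast branch catches.
settings = {"CRAWL_BATCH_SIZE": "fifty"}  # a word where a number belongs
try:
    batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
except ValueError as e:
    print(f"fail fast: {e}")  # invalid literal for int() with base 10: 'fifty'

settings = {"MEMORY_THRESHOLD_PERCENT": None}  # a null slipped into the store
try:
    memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
except TypeError as e:
    print(f"fail fast: {e}")  # float() rejects None with a TypeError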
