# site-crawler: example environment configuration (.env.example).

# Port the server listens on.
# Optional, default is 8080 locally. Cloud Run sets PORT automatically.
PORT=8080

# Runtime environment and service identity.
# SERVICE_NAME is the identity of this backend service.
# NOTE(review): presumably used to tag logs/telemetry; confirm where it is read.
SERVICE_NAME=cat-crawler-backend

# Public environment label for release/config generation.
# Supported values: local, staging, production
APP_ENV=local

# Public app origin used by the bookmarklet iframe.
# Written as an origin (scheme://host[:port]), as in the value below.
# Local defaults to http://localhost:8080 if omitted.
BOOKMARKLET_APP_ORIGIN=http://localhost:8080

# Trusted proxy boundary for Express req.ip handling.
# Leave unset or false for direct local development with no reverse proxy.
# For a local reverse proxy on the same host, use: TRUST_PROXY=loopback
# For production behind a known ingress, set explicit trusted proxy IP/CIDR values
# as a comma-separated list, for example:
# TRUST_PROXY=10.0.0.0/8,192.168.0.0/16
# Do not use TRUST_PROXY=true.
# NOTE(review): presumably this maps to Express's "trust proxy" setting, where
# `true` takes the client IP from the X-Forwarded-For header, which a client can
# forge; that would undermine the per-IP rate limits in this file. Confirm against
# the server setup.
TRUST_PROXY=false

# Logging and runtime safety.
# Boolean flags in this group are written as lowercase true/false.
# GRACEFUL_SHUTDOWN_TIMEOUT_MS is in milliseconds (10000 ms = 10 s).
LOG_LEVEL=info
LOG_HEALTHCHECKS=false
ENABLE_REQUEST_LOGGING=true
EXPOSE_INTERNAL_ERRORS=false
EXIT_ON_UNHANDLED_ERROR=true
GRACEFUL_SHUTDOWN_TIMEOUT_MS=10000

# Request body size limit for JSON API requests.
# Value is in bytes (65536 bytes = 64 KiB).
API_BODY_LIMIT_BYTES=65536

# Per-IP rate limits, one WINDOW_MS/MAX pair per endpoint group
# (crawl start, crawl status, crawl results).
# *_WINDOW_MS values are in milliseconds: 900000 ms = 15 min, 60000 ms = 1 min.
# NOTE(review): *_MAX is presumably the number of requests allowed per IP within
# the matching window; confirm against the rate limiter setup.
RATE_LIMIT_CRAWL_START_WINDOW_MS=900000
RATE_LIMIT_CRAWL_START_MAX=10
RATE_LIMIT_CRAWL_STATUS_WINDOW_MS=60000
RATE_LIMIT_CRAWL_STATUS_MAX=120
RATE_LIMIT_CRAWL_RESULTS_WINDOW_MS=60000
RATE_LIMIT_CRAWL_RESULTS_MAX=30

# Crawl defaults and hard caps.
# NOTE(review): CRAWL_DEFAULT_* presumably applies when a request omits the
# option, and the matching CRAWL_MAX_* caps what a request may ask for; confirm
# against the request validation code. Keep each default at or below its cap.
# *_TIMEOUT_MS values are in milliseconds (12000 ms = 12 s, 30000 ms = 30 s).
CRAWL_DEFAULT_MAX_PAGES=300
CRAWL_MAX_PAGES=300
CRAWL_DEFAULT_CONCURRENCY=6
CRAWL_MAX_CONCURRENCY=6
CRAWL_DEFAULT_TIMEOUT_MS=12000
CRAWL_MAX_TIMEOUT_MS=30000
# Hard cap for this project: max 2 concurrent active crawl jobs.
CRAWL_MAX_ACTIVE_JOBS=2
CRAWL_MAX_QUEUED_JOBS=20
CRAWL_MAX_REDIRECTS=10
# *_BYTES values are in bytes:
# 2097152 = 2 MiB (HTML), 5242880 = 5 MiB (sitemap), 524288 = 512 KiB (robots).
CRAWL_MAX_HTML_BYTES=2097152
CRAWL_MAX_SITEMAP_BYTES=5242880
CRAWL_MAX_ROBOTS_BYTES=524288

# Durable background crawl job state.
# Values named here: file, firestore.
# Local development defaults to JOB_STATE_BACKEND=file.
# Staging and production must use JOB_STATE_BACKEND=firestore.
JOB_STATE_BACKEND=file

# Local file-backed job state path for development and tests.
# NOTE(review): the path is relative; presumably resolved against the process
# working directory. Confirm against the job store implementation.
JOB_STATE_FILE=./backend/.data/crawl-jobs.json

# Shared Firestore collection for job metadata, queue entries, and result chunks.
# NOTE(review): presumably only read when JOB_STATE_BACKEND=firestore; confirm.
FIRESTORE_CRAWL_JOBS_COLLECTION=crawlJobs

# Running-job lease and recovery timings.
# All values are in milliseconds:
# lease 30000 ms = 30 s, heartbeat 10000 ms = 10 s,
# dispatch interval 5000 ms = 5 s, TTL 1800000 ms = 30 min.
# NOTE(review): the heartbeat presumably has to stay well below the lease so a
# live job renews its lease before it expires; confirm against the job runner.
CRAWL_JOB_LEASE_MS=30000
CRAWL_JOB_HEARTBEAT_MS=10000
CRAWL_JOB_DISPATCH_INTERVAL_MS=5000
CRAWL_JOB_TTL_MS=1800000