Skip to content

Commit d1ab2c4

Browse files
authored
Merge pull request #92 from OpenMined/fix/improve-500-exception-logging
Improve logging with log rotation and configurable log level
2 parents be9bc3e + 0e1faf2 commit d1ab2c4

15 files changed

Lines changed: 76 additions & 27 deletions

File tree

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ ENV PATH="/app/.venv/bin:$PATH" \
8282
PYTHONUNBUFFERED=1 \
8383
SYFT_PORT=8080 \
8484
SYFT_SQLITE_DB_PATH=/data/app.db \
85+
SYFT_LOG_FILE=/data/logs/syft-space-server.log \
8586
DOCKER_NETWORK_HOST=host.docker.internal
8687

8788
EXPOSE 8080

backend/syft_space/components/datasets/handlers.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ async def _ensure_provisioner_running(
121121
state=new_state_dict,
122122
)
123123
except Exception as e:
124-
# Error already logged by provisioner, just update state and raise
124+
logger.exception(f"Failed to start provisioner for '{dtype}': {e}")
125125
await self.provisioner_state_repository.upsert_status(
126126
dtype=dtype,
127127
status=ProvisionerStatus.ERROR,
@@ -181,7 +181,7 @@ async def _stop_provisioner(self, dtype: str) -> None:
181181
status=ProvisionerStatus.STOPPED,
182182
)
183183
except Exception as e:
184-
logger.error(f"Failed to stop provisioner for '{dtype}': {e}")
184+
logger.exception(f"Failed to stop provisioner for '{dtype}': {e}")
185185
await self.provisioner_state_repository.upsert_status(
186186
dtype=dtype,
187187
status=ProvisionerStatus.ERROR,
@@ -273,7 +273,7 @@ async def shutdown_all_provisioners(self) -> None:
273273
try:
274274
await self._stop_provisioner(state.dtype)
275275
except Exception as e:
276-
logger.error(f"Failed to stop provisioner '{state.dtype}': {e}")
276+
logger.exception(f"Failed to stop provisioner '{state.dtype}': {e}")
277277
# Continue with other provisioners - best effort shutdown
278278

279279
# ============== Dataset Type Methods ==============
@@ -533,7 +533,7 @@ async def delete_dataset(self, name: str, tenant: Tenant) -> dict:
533533
f"Dataset type '{dataset.dtype}' not registered, skipping resource cleanup"
534534
)
535535
except Exception as e:
536-
logger.error(f"Failed to clean up resources for dataset '{name}': {e}")
536+
logger.exception(f"Failed to clean up resources for dataset '{name}': {e}")
537537

538538
deleted = await self.repository.delete_by_name(name, tenant.id)
539539
if not deleted:

backend/syft_space/components/datasets/provisioner_manager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ async def _startup_provisioners_background(self) -> None:
4848
await self.dataset_handler.startup_all_provisioners()
4949
logger.info("Provisioner startup complete")
5050
except Exception as e:
51-
logger.error(f"Error during provisioner startup: {e}")
51+
logger.exception(f"Error during provisioner startup: {e}")
5252
finally:
5353
# Signal that provisioner startup is done (success or failure)
5454
# This allows ingestion to proceed even if provisioners failed
@@ -65,4 +65,4 @@ async def shutdown(self) -> None:
6565
await self.dataset_handler.shutdown_all_provisioners()
6666
logger.info("Provisioner shutdown complete")
6767
except Exception as e:
68-
logger.error(f"Error during provisioner shutdown: {e}")
68+
logger.exception(f"Error during provisioner shutdown: {e}")

backend/syft_space/components/endpoints/handlers.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ async def _search_dataset(
406406
try:
407407
search_result = await dataset_instance.search(ctx, query, search_params)
408408
except Exception as e:
409+
logger.exception(f"Dataset search failed: {e}")
409410
raise HTTPException(
410411
status_code=500, detail=f"Dataset search failed: {str(e)}"
411412
) from e
@@ -451,12 +452,16 @@ async def _chat_with_model(
451452
endpoint.model_id, endpoint.tenant_id
452453
)
453454
if not model:
455+
logger.error(
456+
f"Model not found: model_id={endpoint.model_id}, tenant_id={endpoint.tenant_id}"
457+
)
454458
raise HTTPException(status_code=500, detail="Model not found")
455459

456460
# Get model type
457461
try:
458462
model_type_cls = self.model_registry.get_model_type(model.dtype)
459463
except KeyError:
464+
logger.exception(f"Model type '{model.dtype}' not registered")
460465
raise HTTPException(
461466
status_code=500, detail=f"Model type '{model.dtype}' not registered"
462467
) from None
@@ -500,6 +505,7 @@ async def _chat_with_model(
500505
# Chat with the model
501506
chat_result = await model_instance.chat(ctx, messages, chat_params)
502507
except Exception as e:
508+
logger.exception(f"Model chat failed: {e}")
503509
raise HTTPException(
504510
status_code=500, detail=f"Model chat failed: {str(e)}"
505511
) from e
@@ -508,6 +514,11 @@ async def _chat_with_model(
508514
# For simplicity, take the last message
509515
last_message = chat_result.messages[-1] if chat_result.messages else None
510516
if not last_message:
517+
logger.error(
518+
"Model returned no messages: "
519+
f"model_id={endpoint.model_id}, "
520+
f"chat_result_id={chat_result.id}"
521+
)
511522
raise HTTPException(status_code=500, detail="Model returned no messages")
512523

513524
return SummaryResponse(
@@ -550,6 +561,7 @@ async def publish_endpoint(
550561
HTTPException: If endpoint not found or marketplace repository not configured
551562
"""
552563
if not self.marketplace_repository:
564+
logger.error("Marketplace publishing is not configured")
553565
raise HTTPException(
554566
status_code=500,
555567
detail="Marketplace publishing is not configured",
@@ -607,6 +619,7 @@ async def unpublish_endpoint(
607619
) -> list[UnpublishResult]:
608620
"""Unpublish an endpoint from all its marketplaces."""
609621
if not self.marketplace_repository:
622+
logger.error("Marketplace unpublishing is not configured")
610623
raise HTTPException(
611624
status_code=500,
612625
detail="Marketplace publishing is not configured",
@@ -683,7 +696,7 @@ async def _unpublish_endpoint(
683696
error=e.message,
684697
)
685698
except Exception as e:
686-
logger.error(
699+
logger.exception(
687700
f"Failed to unpublish endpoint {endpoint.slug} from {marketplace.name}: {str(e)}"
688701
)
689702
return UnpublishResult(
@@ -798,6 +811,7 @@ async def check_slug_availability(
798811

799812
# Check marketplace availability
800813
if not self.marketplace_repository:
814+
logger.error("Marketplace checking is not configured")
801815
raise HTTPException(
802816
status_code=500,
803817
detail="Marketplace checking is not configured",
@@ -1017,7 +1031,7 @@ async def sync_endpoints_to_marketplaces(
10171031
f"Failed to sync to marketplace {marketplace_id}: {e.message}"
10181032
)
10191033
except Exception as e:
1020-
logger.error(
1034+
logger.exception(
10211035
f"Unexpected error syncing to marketplace {marketplace_id}: {e}"
10221036
)
10231037

backend/syft_space/components/ingestion/manager.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -286,14 +286,14 @@ async def _start_existing_dataset_watchers(self) -> None:
286286
try:
287287
await self.start_dataset_ingestion(dataset)
288288
except Exception as e:
289-
logger.error(
289+
logger.exception(
290290
f"Failed to start ingestion for '{dataset.name}': {e}"
291291
)
292292
logger.info(
293293
f"Background: Started ingestion for {len(all_datasets)} datasets"
294294
)
295295
except Exception as e:
296-
logger.error(f"Failed to start existing dataset watchers: {e}")
296+
logger.exception(f"Failed to start existing dataset watchers: {e}")
297297

298298
async def shutdown(self) -> None:
299299
"""Shutdown the ingestion manager."""
@@ -361,7 +361,7 @@ async def _event_processor_loop(self) -> None:
361361
# Queue was closed during shutdown
362362
break
363363
except Exception as e:
364-
logger.error(f"Error processing file event: {e}")
364+
logger.exception(f"Error processing file event: {e}")
365365

366366
logger.info("Event processor stopped")
367367

@@ -522,7 +522,7 @@ async def _process_single_job(self, job: IngestionJob) -> None:
522522
logger.info(f"Successfully ingested: {job.file_path}")
523523

524524
except Exception as e:
525-
logger.error(f"Failed to ingest {job.file_path}: {str(e)}")
525+
logger.exception(f"Failed to ingest {job.file_path}: {str(e)}")
526526
await self._ingestion_repository.update_status(
527527
job.id, IngestionJobStatus.FAILED, str(e)
528528
)
@@ -608,7 +608,7 @@ async def _start_watcher(self, dataset: Dataset) -> bool:
608608
f"(recursive={recursive})"
609609
)
610610
except Exception as e:
611-
logger.error(f"Failed to watch path {watch_path}: {e}")
611+
logger.exception(f"Failed to watch path {watch_path}: {e}")
612612

613613
self._watches[dataset.id] = watches
614614
return len(watches) > 0

backend/syft_space/components/settings/handlers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ async def configure_proxy(self, tenant: Tenant, token: str) -> ProxyStatusRespon
168168
try:
169169
public_url = await self.proxy_service.connect(token, marketplace.username)
170170
except Exception as e:
171-
logger.error(f"Failed to connect to ngrok: {e}")
171+
logger.exception(f"Failed to connect to ngrok: {e}")
172172
raise HTTPException(status_code=400, detail=str(e)) from e
173173

174174
await self.sync_public_url_to_marketplace(marketplace, public_url)

backend/syft_space/components/shared/database.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def do_run_migrations(connection) -> None:
152152
except Exception as stamp_error:
153153
logger.warning(f"Could not stamp database: {stamp_error}")
154154
else:
155-
logger.error(f"Migration failed in production mode: {e}")
155+
logger.exception(f"Migration failed in production mode: {e}")
156156
raise RuntimeError(
157157
f"Database migration failed. This is fatal in production. "
158158
f"Error: {e}"

backend/syft_space/components/shared/heartbeat_manager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ async def _heartbeat_loop(self) -> None:
166166
except asyncio.CancelledError:
167167
break
168168
except Exception as e:
169-
logger.error(f"Unexpected error in heartbeat loop: {e}")
169+
logger.exception(f"Unexpected error in heartbeat loop: {e}")
170170
# Wait before retrying
171171
try:
172172
await asyncio.wait_for(
@@ -297,7 +297,7 @@ async def _send_heartbeat_to_marketplace(
297297
)
298298
except Exception as e:
299299
self._update_state_on_failure(state)
300-
logger.error(
300+
logger.exception(
301301
f"Unexpected error sending heartbeat to {marketplace.name}: {e}"
302302
)
303303

backend/syft_space/components/shared/logging_config.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,35 @@
99
import inspect
1010
import logging
1111
import sys
12+
from pathlib import Path
1213

1314
from loguru import logger
1415

16+
from syft_space.config import app_settings
17+
1518
# Remove loguru's default handler
1619
logger.remove()
1720

18-
# Add handler with simplified format
21+
# Console handler (stderr)
22+
# colorize=None lets loguru auto-detect TTY (colors in terminals, plain in Docker/pipes)
1923
logger.add(
2024
sys.stderr,
2125
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <5}</level> | <cyan>{name}</cyan> - <level>{message}</level>",
22-
level="INFO",
23-
colorize=True,
26+
level=app_settings.log_level.upper(),
27+
colorize=None,
28+
)
29+
30+
# File handler
31+
log_path = Path(app_settings.log_file).expanduser()
32+
log_path.parent.mkdir(parents=True, exist_ok=True)
33+
logger.add(
34+
str(log_path.resolve()),
35+
format="{time:YYYY-MM-DD HH:mm:ss} | {level: <5} | {name} - {message}",
36+
level=app_settings.log_level.upper(),
37+
rotation="50 MB",
38+
retention="7 days",
39+
compression="gz",
40+
colorize=False,
2441
)
2542

2643

backend/syft_space/components/shared/proxy_service.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ async def auto_connect_if_configured(self) -> None:
193193
await self._settings_repository.update_public_url(public_url)
194194
logger.info(f"Ngrok tunnel auto-connected: {public_url}")
195195
except Exception as e:
196-
logger.error(f"Failed to auto-connect ngrok tunnel: {e}")
196+
logger.exception(f"Failed to auto-connect ngrok tunnel: {e}")
197197
# Don't clear the token on auto-connect failure
198198
# User can retry or reconfigure
199199

@@ -204,7 +204,7 @@ async def _auto_connect_background(self) -> None:
204204
await self.auto_connect_if_configured()
205205
self.log_connection_info()
206206
except Exception as e:
207-
logger.error(f"Background proxy startup error: {e}")
207+
logger.exception(f"Background proxy startup error: {e}")
208208
finally:
209209
# Always signal completion (success or failure)
210210
if self._ready_event:
@@ -288,7 +288,7 @@ async def _reconnect_loop(self) -> None:
288288
delay = self.RECONNECT_INITIAL_DELAY # Reset delay on success
289289
logger.info("Ngrok tunnel reconnected successfully")
290290
except Exception as e:
291-
logger.error(f"Reconnection failed: {e}")
291+
logger.exception(f"Reconnection failed: {e}")
292292
# Exponential backoff
293293
delay = min(
294294
delay * self.RECONNECT_BACKOFF_FACTOR, self.RECONNECT_MAX_DELAY

0 commit comments

Comments
 (0)