open-edge-platform
diff --git a/metro-ai-suite/live-video-analysis/live-video-captioning/.gitignore
Lines changed: 2 additions & 0 deletions b/metro-ai-suite/live-video-analysis/live-video-captioning/.gitignore
Lines changed: 2 additions & 0 deletions
diff --git a/metro-ai-suite/live-video-analysis/live-video-captioning/AGENTS.md
Lines changed: 0 additions & 231 deletions b/metro-ai-suite/live-video-analysis/live-video-captioning/AGENTS.md
Lines changed: 0 additions & 231 deletions
diff --git a/metro-ai-suite/live-video-analysis/live-video-captioning/README.md
Lines changed: 1 addition & 0 deletions b/metro-ai-suite/live-video-analysis/live-video-captioning/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/config.py
Lines changed: 3 additions & 0 deletions b/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/config.py
Lines changed: 3 additions & 0 deletions
diff --git a/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/models/responses.py
Lines changed: 1 addition & 0 deletions b/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/models/responses.py
Lines changed: 1 addition & 0 deletions
diff --git a/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/routes/runs.py
Lines changed: 16 additions & 6 deletions b/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/routes/runs.py
Lines changed: 16 additions & 6 deletions
diff --git a/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/services/__init__.py
Lines changed: 10 additions & 1 deletion b/metro-ai-suite/live-video-analysis/live-video-captioning/app/backend/services/__init__.py
Lines changed: 10 additions & 1 deletion
@@ -6,3 +6,5 @@ ov_detection_models
 shared/
 *__pycache__*
 app/live_video_captioning.egg-info/*
+charts/Chart.lock
+charts/charts/*.tgz
@@ -116,128 +116,6 @@ pytest tests/ --cov=backend --cov=main --cov-fail-under=80
 # Note: Install linting tools as needed for your style checks
 ```
 
-### Running the Application Locally
-
-```bash
-# Development server (auto-reload on code changes)
-uvicorn main:app --host 127.0.0.1 --port 4173 --reload
-
-# Production server
-uvicorn main:app --host 0.0.0.0 --port 4173
-```
-
-### Docker & Compose
-
-```bash
-# Build and start all services
-docker-compose -f compose.yaml up --build
-
-# Stop all services
-docker-compose -f compose.yaml down
-
-# View logs from all services
-docker-compose -f compose.yaml logs -f
-```
-
-## Code Style & Conventions
-
-### Python Style Requirements
-
-- **Type Hints**: All functions must have type hints on parameters and return types
-- **Naming Conventions**:
-  - Functions/variables: `snake_case` (e.g., `get_mqtt_subscriber()`, `models_dir`)
-  - Classes: `PascalCase` (e.g., `APIResponse`, `MQTTSubscriber`)
-  - Constants: `UPPER_SNAKE_CASE` (e.g., `APP_PORT`, `MQTT_BROKER_HOST`)
-  - Private attributes/methods: Prefix with `_` (e.g., `_patch_config`, `_get()`)
-
-- **Docstrings**: Use module-level and function docstrings in Google style:
-  ```python
-  """Brief description spanning one line.
-  
-  Longer explanation with more detail if needed.
-  
-  Args:
-      param_name: Description of parameter
-      
-  Returns:
-      Description of return value
-      
-  Raises:
-      ExceptionType: When this occurs
-  """
-  ```
-
-### FastAPI Patterns
-
-✅ **Good** - Clear route with proper async handling and types:
-```python
-from fastapi import APIRouter, HTTPException
-from typing import Optional
-
-router = APIRouter(prefix="/runs", tags=["runs"])
-
-@router.get("/{run_id}")
-async def get_run_by_id(run_id: str) -> dict:
-    """Retrieve a specific run by ID."""
-    run = RUNS.get(run_id)
-    if not run:
-        raise HTTPException(status_code=404, detail="Run not found")
-    return run
-```
-
-❌ **Bad** - Missing types, no error handling, sync function with async I/O:
-```python
-@router.get("/{run_id}")
-def get_run(x):
-    return RUNS.get(x)
-```
-
-### Testing Patterns
-
-✅ **Good** - Named fixtures, clear assertions, proper async handling:
-```python
-@pytest.mark.asyncio
-async def test_mqtt_subscriber_connects(mock_mqtt):
-    """Test MQTT subscriber establishes connection."""
-    subscriber = await get_mqtt_subscriber()
-    assert subscriber is not None
-    subscriber.connect.assert_called_once()
-```
-
-❌ **Bad** - Generic test names, no setup isolation, missing assertions:
-```python
-def test_mqtt():
-    sub = get_mqtt_subscriber()
-    sub.connect()
-```
-
-### Import Organization
-
-1. Standard library imports first
-2. Third-party imports second
-3. Local imports last
-4. Blank line between each group
-
-```python
-import os
-import logging
-from pathlib import Path
-from contextlib import asynccontextmanager
-
-from fastapi import FastAPI
-from paho.mqtt.client import Client
-
-from backend.config import APP_PORT
-from backend.routes import config_router
-```
-
-### Configuration & Environment Variables
-
-- All environment variables are read in `backend/config.py` with defaults
-- Use `os.environ.get("VARIABLE_NAME", "default_value")`
-- For integer values, always validate: use `int(os.environ.get(...))` with try-except
-- Configuration is immutable after import; never modify `backend.config` values at runtime
-
 ## Testing Guidelines
 
 ### Test Organization
@@ -323,115 +201,6 @@ The FastAPI application exposes these route groups:
 🚫 **Never hardcode configuration values** - all config must go through `backend/config.py`
 🚫 **Never commit model files** - only include config, tokenizers, and references; models are downloaded at build time
 
-## Git Workflow
-
-### Branch Naming
-- Feature branches: `feature/description` (e.g., `feature/add-caption-filtering`)
-- Bug fixes: `fix/description` (e.g., `fix/mqtt-reconnection-issue`)
-- Documentation: `docs/description` (e.g., `docs/api-reference`)
-
-### Commit Message Format
-- Use descriptive, present-tense messages
-- Keep first line under 50 characters
-- Reference issue/task if applicable
-
-✅ **Good**:
-```
-feat: Add WebSocket caption streaming endpoint
-
-- Implement /runs/{run_id}/captions WebSocket
-- Add real-time frame processing
-- Include unit tests for streaming
-- Closes #42
-```
-
-❌ **Bad**:
-```
-fixed stuff
-update routes
-asdf
-```
-
-### Before Pushing
-
-1. Run full test suite: `pytest -v --cov=backend --cov=main --cov-fail-under=80`
-2. Verify no hardcoded secrets or credentials
-3. Ensure all type hints are present
-4. Check that docstrings are updated on new functions
-
-## Development Workflow for Agents
-
-**When implementing a feature:**
-
-1. **Understand the context**: Read the relevant route (`backend/routes/*.py`), service, and test files
-2. **Write tests first**: Create test cases in `app/tests/test_*.py` before implementing
-3. **Implement the feature**: Add code to the appropriate module (route, service, model)
-4. **Run tests immediately**: Use `pytest -v <test_file>` to verify your changes
-5. **Check coverage**: Run `pytest --cov=backend --cov-fail-under=80` to ensure 80%+ coverage
-6. **Review your code**: Verify type hints, docstrings, and naming conventions
-7. **Commit with clear message**: Reference the issue or task
-
-**Example workflow for adding a new endpoint:**
-
-```bash
-# 1. Create test file
-touch app/tests/test_routes_new_feature.py
-
-# 2. Write test cases
-# (edit test file with test_* functions)
-
-# 3. Run tests (they'll fail)
-pytest -v app/tests/test_routes_new_feature.py
-
-# 4. Implement the feature
-# (edit backend/routes/new_feature.py or modify existing routes)
-
-# 5. Run tests again
-pytest -v app/tests/test_routes_new_feature.py
-
-# 6. Check full coverage
-pytest -v --cov=backend --cov=main --cov-fail-under=80
-
-# 7. Commit
-git add app/tests/ app/backend/
-git commit -m "feat: Add new feature endpoint with tests"
-```
-
-## Debugging Tips
-
-### Enable Debug Logging
-```bash
-# Run app with DEBUG level logging
-LOGLEVEL=DEBUG uvicorn main:app --reload --log-level debug
-
-# Run tests with logging
-pytest -v tests/ --log-cli-level=DEBUG
-```
-
-### Inspect MQTT Messages
-```bash
-# Connect to MQTT broker and listen to topics
-mosquitto_sub -h localhost -p 1883 -t "live-video-captioning/#"
-```
-
-### Test Individual Routes
-```bash
-# Start dev server
-uvicorn main:app --reload
-
-# In another terminal, test endpoint
-curl -X GET http://localhost:4173/health
-curl -X GET http://localhost:4173/config
-```
-
-### Common Issues
-
-**Tests fail with config errors**: Make sure `conftest.py` has `@pytest.fixture(autouse=True)` for `_patch_config` - it should auto-patch environment variables before tests run
-
-**MQTT connection times out in tests**: Verify `mock_mqtt` fixture is auto-used in `conftest.py` - it stubs out real MQTT connections
-
-**Coverage threshold fails**: Run `pytest --cov-report=html` and open `htmlcov/index.html` to see which lines aren't covered
-
 ## Key Files Reference
 
 | File | Purpose |
 
@@ -25,6 +25,7 @@ For more information see [How it works](./docs/user-guide/how-it-works.md)
 - [Overview](./docs/user-guide/index.md)
 - [System Requirements](./docs/user-guide/get-started/system-requirements.md)
 - [Get Started](./docs/user-guide/get-started.md)
+- [Deploy with Helm](./docs/user-guide/deploy-with-helm.md)
 - [API Reference](./docs/user-guide/api-reference.md)
 - [How to Build Source](./docs/user-guide/get-started/build-from-source.md)
 - [Known Issues](./docs/user-guide/known-issues.md)
 
@@ -37,6 +37,9 @@ def _read_non_negative_int(var_name: str, default: int) -> int:
     "PIPELINE_SERVER_URL", "http://dlstreamer-pipeline-server:8080"
 )
 PIPELINE_NAME = os.environ.get("PIPELINE_NAME", "genai_pipeline")
+# How often (in seconds) to poll the pipeline server for run health. 0 disables polling.
+# Keep this low (≤10 s) so the UI reflects a crashed pipeline server quickly.
+PIPELINE_POLL_INTERVAL = _read_non_negative_int("PIPELINE_POLL_INTERVAL", 8)
 
 BASE_DIR = Path(__file__).parent.parent
 MODELS_DIR = Path(os.environ.get("MODELS_DIR", str(BASE_DIR / "ov_models")))
 
@@ -10,6 +10,7 @@ class RunInfo(BaseModel):
     pipelineId: str
     peerId: str
     mqttTopic: str
+    status: str = "running"
     modelName: Optional[str] = None
     pipelineName: Optional[str] = None
     runName: Optional[str] = None
 
@@ -173,7 +173,12 @@ async def list_runs() -> list[RunInfo]:
 
 
 async def _multiplexed_metadata_generator() -> AsyncGenerator[str, None]:
-    """Generator that receives metadata from MQTT and multiplexes into a single SSE stream."""
+    """Generator that receives metadata from MQTT and multiplexes into a single SSE stream.
+
+    A status heartbeat is sent every second when no MQTT message arrives, carrying
+    the current status of every active run so the frontend can react when a run
+    transitions to ``"error"`` (detected by the background health monitor).
+    """
     message_queue: asyncio.Queue = asyncio.Queue()
     subscribed_runs: set[str] = set()
 
@@ -186,8 +191,8 @@ def on_message(run_id: str, data: dict, received_at: float):
         except Exception as e:
             logger.error(f"Error queueing MQTT message: {e}")
 
+    mqtt_subscriber = await get_mqtt_subscriber()
     try:
-        mqtt_subscriber = await get_mqtt_subscriber()
 
         while True:
             try:
@@ -223,17 +228,22 @@ def on_message(run_id: str, data: dict, received_at: float):
                     yield f"data: {json.dumps(envelope)}\n\n"
 
                 except asyncio.TimeoutError:
-                    # No message received, send heartbeat
-                    yield ": heartbeat\n\n"
+                    # No MQTT message – send a status heartbeat so the frontend
+                    # learns when a run transitions to "error".
+                    status_payload = {
+                        "type": "status",
+                        "runs": {rid: info.status for rid, info in RUNS.items()},
+                    }
+                    yield f"data: {json.dumps(status_payload)}\n\n"
 
             except Exception as e:
                 logger.error(f"Error in multiplexed metadata generator: {e}")
                 yield f": error - {e}\n\n"
                 await asyncio.sleep(1)
 
     finally:
-        # Cleanup subscriptions when generator is closed
-        mqtt_subscriber = await get_mqtt_subscriber()
+        # Reuse the already-resolved subscriber — avoids creating a new connection
+        # during app shutdown when the global subscriber may already be torn down.
         for run_id in subscribed_runs:
             mqtt_subscriber.unsubscribe_from_run(run_id)
         logger.info("Cleaned up MQTT subscriptions")
 
@@ -4,19 +4,28 @@
     discover_detection_models,
     discover_pipelines_remote,
 )
-from .http_client import http_json
+from .http_client import http_json, try_get_json
 from .mqtt_subscriber import (
     MQTTSubscriber,
     get_mqtt_subscriber,
     shutdown_mqtt_subscriber,
 )
+from .pipeline_health import (
+    check_pipeline_health,
+    start_pipeline_health_monitor,
+    stop_pipeline_health_monitor,
+)
 
 __all__ = [
     "discover_models",
     "discover_detection_models",
     "discover_pipelines_remote",
     "http_json",
+    "try_get_json",
     "MQTTSubscriber",
     "get_mqtt_subscriber",
     "shutdown_mqtt_subscriber",
+    "check_pipeline_health",
+    "start_pipeline_health_monitor",
+    "stop_pipeline_health_monitor",
 ]
Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,9 @@ def _read_non_negative_int(var_name: str, default: int) -> int:`
`37`	`37`	`"PIPELINE_SERVER_URL", "http://dlstreamer-pipeline-server:8080"`
`38`	`38`	`)`
`39`	`39`	`PIPELINE_NAME = os.environ.get("PIPELINE_NAME", "genai_pipeline")`
	`40`	`+# How often (in seconds) to poll the pipeline server for run health. 0 disables polling.`
	`41`	`+# Keep this low (≤10 s) so the UI reflects a crashed pipeline server quickly.`
	`42`	`+PIPELINE_POLL_INTERVAL = _read_non_negative_int("PIPELINE_POLL_INTERVAL", 8)`
`40`	`43`
`41`	`44`	`BASE_DIR = Path(__file__).parent.parent`
`42`	`45`	`MODELS_DIR = Path(os.environ.get("MODELS_DIR", str(BASE_DIR / "ov_models")))`