Merged
35 commits
4d1e8b0
samples: add Python VLM alerts sample using HF Optimum + gvagenai
oonyshch Feb 17, 2026
032c733
fix for the pipeline and additional packages resolution
oonyshch Feb 17, 2026
361fe2f
Merge branch 'main' into oonyshch/vlm_alerts
oonyshch Feb 17, 2026
085783d
modify requirements.txt
oonyshch Feb 17, 2026
a6c5825
fix in requirements.txt
oonyshch Feb 19, 2026
026849d
Merge branch 'main' into oonyshch/vlm_alerts
oonyshch Feb 23, 2026
edd52a2
vlm_alerts: fix pipeline string
oonyshch Feb 23, 2026
cd3601f
vlm_alerts: add README.md
oonyshch Feb 23, 2026
173d146
vlm_alerts: refactoring script after being pylint-shamed
oonyshch Feb 23, 2026
2bad652
vlm_alerts: trying to avoid the gst pylint error and restoring the li…
oonyshch Feb 23, 2026
cbcf7e6
vlm_alerts: make pylint ignore the gst import
oonyshch Feb 23, 2026
4e33bf2
vlm_alerts: disable pylint on both gst and glib
oonyshch Feb 23, 2026
d2ccfed
Windows - install VS Build Tools in setup script (#630)
dmichalo Feb 23, 2026
94ee21f
Fixed inconsistencies between code and comments. (#632)
jmotow Feb 23, 2026
0d0f348
Enable custom code to add GstAnalytics data outside of DLS components…
tjanczak Feb 24, 2026
2cfcd49
Extend Optimizer about input device selection and improved results re…
tbujewsk Feb 24, 2026
a7d9843
Disable gstreamer gpl plugins (#636)
mholowni Feb 24, 2026
ac6c189
[POST-PROC][YOLOv26 OBB] add blob parsing function to handle obb dime…
walidbarakat Feb 25, 2026
b3ee1db
Install Visual C++ runtime in setup (#635)
yunowo Feb 25, 2026
d9a159d
[GST gvawatermark] fix watermark default text backgroung behaviour (#…
walidbarakat Feb 25, 2026
2eecaf7
[DOCS] fix formatting (#641)
kblaszczak-intel Feb 25, 2026
c18fb0f
Fix yolo_v10.cpp compile error on windows (#645)
yunowo Feb 26, 2026
3fdc177
[DOCS] Add a warning about improper proxy handling by PAHO library (#…
msmiatac Feb 26, 2026
0941c5c
Update to OpenVino 2026.0.0 (#640)
tbujewsk Feb 26, 2026
b13008e
[vlm_alerts.py]: refine alert logic and improve processing flow
oonyshch Feb 26, 2026
fa1e62d
Merge branch 'main' into oonyshch/vlm_alerts
oonyshch Feb 26, 2026
c1b9594
Merge branch 'main' into oonyshch/vlm_alerts
oonyshch Feb 26, 2026
223951c
cancel changes in cmake and dockerfiles
oonyshch Feb 26, 2026
ced3814
refactoring README.md and requirements.txt
oonyshch Feb 26, 2026
8684109
Merge branch 'main' into oonyshch/vlm_alerts
oonyshch Feb 26, 2026
59a80be
vlm_alerts: add CLI help section to README and fix gi import order
oonyshch Feb 26, 2026
70caa31
vlm_alerts: improve graph in README and change venv name
oonyshch Feb 27, 2026
50d78eb
Merge branch 'main' into oonyshch/vlm_alerts
oonyshch Feb 27, 2026
60c1dc3
vlm_alerts: forgot parenthesis in graph
oonyshch Feb 27, 2026
86f4c1e
vlm_alerts: refactoring of requirements to match new sample conventio…
oonyshch Feb 27, 2026
76 changes: 76 additions & 0 deletions samples/gstreamer/python/vlm_alerts/README.md
@@ -0,0 +1,76 @@
# VLM Alerts

This sample demonstrates how to download a Vision-Language Model (VLM) from Hugging Face, export it to OpenVINO IR using `optimum-cli`, and run inference in a DL Streamer pipeline.

The pipeline saves both JSON metadata and an encoded MP4 output.

## How It Works

The script performs three main steps:

STEP 1 — Prepare input video
If a local file is provided, it is used directly.
If a URL is provided, the video is downloaded automatically into the `videos/` directory.

STEP 2 — Prepare VLM model
If the exported model is not already present under `models/`, it is downloaded from Hugging Face and converted to OpenVINO IR via `optimum-cli`.

Exported artifacts are stored under:

models/<ModelName>

STEP 3 — Build and run the pipeline

The GStreamer pipeline includes:

- gvagenai for VLM inference
- gvametapublish for JSON output
- gvafpscounter for performance display
- gvawatermark for overlay
- vah264enc for hardware encoding

The output video and metadata are written to the `results/` directory.
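The element chain above can be sketched as a `gst-launch-1.0` command. This is a hand-written approximation of the string the script builds, not a verbatim copy: paths in angle brackets are placeholders and `gvagenai` takes more properties (prompt, generation config, frame rate) than shown here.

```
gst-launch-1.0 filesrc location=<video.mp4> ! decodebin3 ! videoconvertscale ! \
  video/x-raw,format=BGRx,width=1280,height=720 ! queue ! \
  gvagenai model-path=<model_dir> device=GPU prompt-path=<prompt.txt> ! queue ! \
  gvametapublish file-format=json-lines file-path=<out.jsonl> ! queue ! \
  gvafpscounter ! gvawatermark ! videoconvert ! vah264enc ! h264parse ! \
  mp4mux ! filesink location=<out.mp4>
```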

## Setup

From the sample directory:

```sh
cd samples/gstreamer/python/vlm_alerts
```

Create and activate a virtual environment:

```sh
python3 -m venv .venv --system-site-packages
source .venv/bin/activate
```

Install dependencies:

```sh
pip install -r requirements.txt
```

## Running

```sh
python3 ./vlm_alerts.py <input_video_or_url> <hf_model_id> "<question>"
```

Example:

```sh
python3 ./vlm_alerts.py \
    https://videos.pexels.com/video-files/2103099/2103099-hd_1280_720_60fps.mp4 \
    OpenGVLab/InternVL3_5-2B \
    "Is there a police car? Answer yes or no."
```

## Output

After execution:

JSON metadata: `results/<model>-<video>.jsonl`

Annotated video: `results/<model>-<video>.mp4`
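The exact JSON schema in the `.jsonl` file is defined by `gvametapublish` and may vary between DL Streamer versions. Assuming only that each line is one JSON object, a minimal post-processing reader might look like this (the demo file and its `answer` field are fabricated for illustration):

```python
import json
import tempfile
from pathlib import Path

def read_alerts(jsonl_path):
    """Yield one parsed JSON object per non-empty line of a JSON-lines file."""
    for line in Path(jsonl_path).read_text().splitlines():
        line = line.strip()
        if line:
            yield json.loads(line)

# Demo on a fabricated two-line file; real gvametapublish output has an
# element-defined schema, so inspect a few lines before relying on keys.
with tempfile.TemporaryDirectory() as tmp:
    demo = Path(tmp) / "demo.jsonl"
    demo.write_text('{"answer": "yes"}\n{"answer": "no"}\n')
    records = list(read_alerts(demo))
    print(len(records))          # 2
    print(records[0]["answer"])  # yes
```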

## Notes

- Each video is downloaded and each model is exported only once; subsequent runs reuse the cached assets.
- Other VLMs can be used as well. Suggested models: `OpenGVLab/InternVL3_5-2B`, `openbmb/MiniCPM-V-4_5`, `Qwen/Qwen2.5-VL-3B-Instruct`.
- GPU is used by default (override with `--device`).
10 changes: 10 additions & 0 deletions samples/gstreamer/python/vlm_alerts/requirements.txt
@@ -0,0 +1,10 @@
--extra-index-url https://download.pytorch.org/whl/cpu
PyGObject==3.50.0
torch==2.9.0+cpu
transformers==4.57.6
optimum-intel==1.27.0
huggingface_hub==0.36.1
einops
timm
openvino==2025.4.0
openvino_tokenizers==2025.4.0.0
235 changes: 235 additions & 0 deletions samples/gstreamer/python/vlm_alerts/vlm_alerts.py
@@ -0,0 +1,235 @@
#!/usr/bin/env python3
"""
Run a DLStreamer VLM pipeline on a video and export JSON and MP4 results.

The script can:
1. Download or reuse a local video.
2. Export or reuse an OpenVINO model.
3. Build a GStreamer pipeline string.
4. Execute the pipeline and store results.
"""

import argparse
import os
import subprocess
import sys
import tempfile
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple

import gi

# gi.require_version must run before importing Gst/GLib from gi.repository.
gi.require_version("Gst", "1.0")
from gi.repository import Gst, GLib  # pylint: disable=wrong-import-position


BASE_DIR = Path(__file__).resolve().parent
VIDEOS_DIR = BASE_DIR / "videos"
MODELS_DIR = BASE_DIR / "models"
RESULTS_DIR = BASE_DIR / "results"


@dataclass
class PipelineConfig:
    """Configuration required to build and run the pipeline."""

    video: Path
    model: Path
    question: str
    device: str
    max_tokens: int
    frame_rate: float


def ensure_video(path_or_url: str) -> Path:
    """Return a local video path, downloading it if needed."""
    candidate = Path(path_or_url)
    if candidate.is_file():
        return candidate.resolve()

    VIDEOS_DIR.mkdir(exist_ok=True)
    filename = path_or_url.rstrip("/").split("/")[-1]
    local_path = VIDEOS_DIR / filename

    if local_path.exists():
        print(f"[video] using cached {local_path}")
        return local_path.resolve()

    print(f"[video] downloading {path_or_url}")
    request = urllib.request.Request(
        path_or_url,
        headers={"User-Agent": "Mozilla/5.0"},
    )

    with urllib.request.urlopen(request) as response, open(local_path, "wb") as file:
        file.write(response.read())

    return local_path.resolve()


def ensure_model(model_id: str) -> Path:
    """Return a local OpenVINO model directory, exporting it if needed."""
    model_name = model_id.split("/")[-1]
    output_dir = MODELS_DIR / model_name

    if output_dir.exists() and any(output_dir.glob("*.xml")):
        print(f"[model] using cached {output_dir}")
        return output_dir.resolve()

    MODELS_DIR.mkdir(exist_ok=True)

    command = [
        "optimum-cli",
        "export",
        "openvino",
        "--model",
        model_id,
        "--task",
        "image-text-to-text",
        "--trust-remote-code",
        str(output_dir),
    ]

    print("[model] exporting:", " ".join(command))
    subprocess.run(command, check=True)

    if not any(output_dir.glob("*.xml")):
        raise RuntimeError("OpenVINO export failed, no XML files found")

    return output_dir.resolve()


def build_pipeline_string(cfg: PipelineConfig) -> Tuple[str, Path, Path, Path]:
    """Construct the GStreamer pipeline string and related output paths."""
    RESULTS_DIR.mkdir(exist_ok=True)

    output_json = RESULTS_DIR / f"{cfg.model.name}-{cfg.video.stem}.jsonl"
    output_video = RESULTS_DIR / f"{cfg.model.name}-{cfg.video.stem}.mp4"

    # Write the question to a temporary prompt file consumed by gvagenai.
    fd, prompt_path_str = tempfile.mkstemp(suffix=".txt")
    prompt_path = Path(prompt_path_str)
    with os.fdopen(fd, "w") as file:
        file.write(cfg.question)

    generation_cfg = f"max_new_tokens={cfg.max_tokens}"

    pipeline_str = (
        f'filesrc location="{cfg.video}" ! '
        f'decodebin3 ! '
        f'videoconvertscale ! '
        f'video/x-raw,format=BGRx,width=1280,height=720 ! '
        f'queue ! '
        f'gvagenai '
        f'model-path="{cfg.model}" '
        f'device={cfg.device} '
        f'prompt-path="{prompt_path}" '
        f'generation-config="{generation_cfg}" '
        f'chunk-size=1 '
        f'frame-rate={cfg.frame_rate} '
        f'metrics=true ! '
        f'queue ! '
        f'gvametapublish file-format=json-lines '
        f'file-path="{output_json}" ! '
        f'queue ! '
        f'gvafpscounter ! '
        f'gvawatermark displ-cfg=text-scale=0.5 ! '
        f'videoconvert ! '
        f'vah264enc ! '
        f'h264parse ! '
        f'mp4mux ! '
        f'filesink location="{output_video}"'
    )

    return pipeline_str, output_json, output_video, prompt_path


def run_pipeline_string(pipeline_str: str) -> int:
    """Execute a GStreamer pipeline string and block until completion."""
    Gst.init(None)

    try:
        pipeline = Gst.parse_launch(pipeline_str)
    except GLib.Error as error:
        print("Pipeline parse error:", str(error))
        return 1

    bus = pipeline.get_bus()
    pipeline.set_state(Gst.State.PLAYING)

    while True:
        # Block until the pipeline reports an error or end-of-stream.
        message = bus.timed_pop_filtered(
            Gst.CLOCK_TIME_NONE,
            Gst.MessageType.ERROR | Gst.MessageType.EOS,
        )

        if message.type == Gst.MessageType.ERROR:
            err, debug = message.parse_error()
            print("ERROR:", err.message)
            if debug:
                print("DEBUG:", debug)
            pipeline.set_state(Gst.State.NULL)
            return 1

        if message.type == Gst.MessageType.EOS:
            pipeline.set_state(Gst.State.NULL)
            return 0


def run_pipeline(cfg: PipelineConfig) -> int:
    """Build and execute the pipeline from configuration."""
    pipeline_str, output_json, output_video, prompt_path = build_pipeline_string(cfg)

    print("\nPipeline:\n")
    print(pipeline_str)
    print()

    try:
        result = run_pipeline_string(pipeline_str)
    finally:
        # Always clean up the temporary prompt file.
        if prompt_path.exists():
            prompt_path.unlink()

    if result == 0:
        print(f"\nJSON output: {output_json}")
        print(f"Video output: {output_video}")

    return result


def parse_args() -> argparse.Namespace:
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="DLStreamer VLM Alerts sample"
    )
    parser.add_argument("video")
    parser.add_argument("model")
    parser.add_argument("question")
    parser.add_argument("--device", default="GPU")
    parser.add_argument("--max-tokens", type=int, default=20)
    parser.add_argument("--frame-rate", type=float, default=1.0)

    return parser.parse_args()


def main() -> int:
    """Entry point."""
    args = parse_args()

    video_path = ensure_video(args.video)
    model_path = ensure_model(args.model)

    config = PipelineConfig(
        video=video_path,
        model=model_path,
        question=args.question,
        device=args.device,
        max_tokens=args.max_tokens,
        frame_rate=args.frame_rate,
    )

    return run_pipeline(config)


if __name__ == "__main__":
    sys.exit(main())