EPFLiGHT
diff --git a/‎configs/config_image_gen_diffusers.yaml‎
Lines changed: 121 additions & 0 deletions b/‎configs/config_image_gen_diffusers.yaml‎
Lines changed: 121 additions & 0 deletions
diff --git a/‎configs/config_image_gen_sglang.yaml‎
Lines changed: 142 additions & 0 deletions b/‎configs/config_image_gen_sglang.yaml‎
Lines changed: 142 additions & 0 deletions
diff --git a/‎configs/config_mock_image_gen.yaml‎ ‎configs/config_mock_image_gen_local.yaml‎configs/config_mock_image_gen.yaml renamed to configs/config_mock_image_gen_local.yaml
Lines changed: 2 additions & 1 deletion b/‎configs/config_mock_image_gen.yaml‎ ‎configs/config_mock_image_gen_local.yaml‎configs/config_mock_image_gen.yaml renamed to configs/config_mock_image_gen_local.yaml
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/mmirage/core/process/base.py‎
Lines changed: 7 additions & 0 deletions b/‎src/mmirage/core/process/base.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/mmirage/core/process/mapper.py‎
Lines changed: 5 additions & 0 deletions b/‎src/mmirage/core/process/mapper.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/mmirage/core/process/processors/image_gen/backends/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎src/mmirage/core/process/processors/image_gen/backends/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/mmirage/core/process/processors/image_gen/backends/base.py‎
Lines changed: 51 additions & 0 deletions b/‎src/mmirage/core/process/processors/image_gen/backends/base.py‎
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,121 @@
+# Local Diffusers image generation example
+# Runs an in-process Diffusers pipeline from a local model path.
+
+processors:
+  - type: image_gen
+    backend: diffusers
+    pipeline_args:
+      model_path: stable-diffusion-v1-5/stable-diffusion-v1-5
+      device: auto                 # auto: multi-GPU via device_map, single GPU, or CPU
+      torch_dtype: float16
+      enable_attention_slicing: false
+    default_sampling_params:
+      num_inference_steps: 30
+      guidance_scale: 4.0
+    parallel_inference: true
+    parallel_chunk_size: 4        # samples per batched GPU call
+    output_dir: /users/qchapp/meditron/MIRAGE/tests/output/image_gen/generated_images
+    file_format: png
+
+loading_params:
+  state_dir: /users/qchapp/meditron/MIRAGE/tests/output/image_gen/_pipeline_state
+  datasets:
+    - path: /users/qchapp/meditron/MIRAGE/tests/mock_data_image_gen/data.jsonl
+      type: JSONL
+      output_dir: /users/qchapp/meditron/MIRAGE/tests/output/image_gen
+  num_shards: 4  # total number of shards; should match the size of the Slurm array
+  shard_id: ${SLURM_ARRAY_TASK_ID}       # set by Slurm array job
+  batch_size: 64
+
+processing_params:
+  inputs:
+    - name: text
+      key: caption
+
+  outputs:
+    - name: generated_image
+      type: image_gen
+      output_mode: path
+      # __sample_index is shard-local; combine with __shard_id for global uniqueness
+      filename_template: "img_{{ __shard_id }}_{{ __sample_index }}_{{ __source_hash }}"
+      width: 1024
+      height: 1024
+      seed: 42    # shard-aware: effective seed = 42 + shard_id * 1_000_000_000 + sample_index
+      prompt: |
+        A photorealistic image of: {{ text }}
+
+  remove_columns: false
+  output_schema:
+    caption: "{{ text }}"
+    image: "{{ generated_image }}"
+
+execution_params:
+  # Execution mode: "local" or "slurm"
+  # - local: Run directly on this machine
+  # - slurm: Submit jobs to SLURM cluster
+  mode: slurm
+
+  # Whether the canonical `run` command should automatically retry failed shards.
+  # - false: submit one run only
+  # - true: submit, wait, and keep retrying failed shards until success or retry budget exhaustion
+  retry: false
+
+  # Maximum number of times to retry a failed shard (default: 3)
+  max_retries: 3
+
+  # ==========================================================================
+  # SLURM CONFIGURATION (only used when mode: slurm)
+  # ==========================================================================
+
+  # HPC account/partition to charge jobs to (REQUIRED for SLURM mode)
+  account: a127
+
+  # SLURM job name (default: "mmirage-sharded")
+  job_name: mmirage-sharded
+
+  # Optional SLURM reservation name (leave blank or omit to run without a reservation)
+  # reservation: "sai-a127"
+
+  # Number of nodes (default: 1)
+  nodes: 1
+
+  # Number of tasks per node (default: 1)
+  ntasks_per_node: 1
+
+  # Number of GPUs per node (default: 4)
+  gpus: 4
+
+  # Number of CPUs per task (default: 288)
+  cpus_per_task: 288
+
+  # Job time limit in HH:MM:SS format (default: "11:59:59")
+  time_limit: "11:59:59"
+
+  # ==========================================================================
+  # PATH CONFIGURATION
+  # ==========================================================================
+  # These support environment variables ($VAR or ${VAR}) and home directory (~)
+
+  # Project root directory (used as base for relative paths)
+  # If not set, uses current working directory
+  # project_root: "/path/to/project"
+
+  # Directory for SLURM output and error files (default: ~/reports)
+  report_dir: "/users/${USER}/reports"
+
+  # HuggingFace cache directory (default: ~/hf)
+  hf_home: "/capstor/store/cscs/swissai/a127/homes/${USER}/hf"
+
+  # EDF environment file path for cluster-specific setup
+  edf_env: "/users/${USER}/.edf/sglang.toml"
+
+  # ==========================================================================
+  # JOB MONITORING (for "submit" and retry orchestration)
+  # ==========================================================================
+
+  # Seconds to wait between checking job status (default: 30)
+  poll_interval_seconds: 30
+
+  # Seconds to wait after job completes before checking results (default: 60)
+  # This allows filesystem to settle on distributed systems
+  settle_time_seconds: 60
@@ -0,0 +1,142 @@
+# SGLang Diffusion server image generation example
+#
+# Two launch modes are available:
+#
+#   launch_mode: managed   (recommended)
+#     MMIRAGE automatically starts a SGLang server on each worker node,
+#     waits until it is ready, runs the pipeline, and shuts it down afterwards.
+#     No manual server management is needed.
+#
+#   launch_mode: external
+#     You are responsible for starting the SGLang server before the pipeline
+#     runs.  Use this if you want to reuse a long-running server or need
+#     fine-grained control over server startup.
+#
+# MMIRAGE handles all dataset sharding, prompt rendering, filename rendering,
+# and result saving.  The SGLang server is only responsible for generating
+# image pixels.
+
+processors:
+  - type: image_gen
+    backend: sglang
+    sglang:
+      launch_mode: managed                       # MMIRAGE starts/stops the server automatically
+      model_path: stable-diffusion-v1-5/stable-diffusion-v1-5
+      port: 30010
+      num_gpus: 4                                # passed as --tp to sglang.launch_server
+      # dtype: float16                           # optional: --dtype flag
+      startup_timeout_seconds: 180               # seconds to wait for the server to become ready
+      # extra_server_args:                       # any additional --flags for sglang.launch_server
+      #   - "--mem-fraction-static"
+      #   - "0.9"
+      api_key: EMPTY                             # unauthenticated local server
+      timeout_seconds: 900
+    default_sampling_params:
+      num_inference_steps: 30
+      guidance_scale: 4.0
+    parallel_inference: true
+    parallel_chunk_size: 4        # concurrent requests per chunk (sequential per sample inside the backend)
+    output_dir: /users/qchapp/meditron/MIRAGE/tests/output/image_gen/generated_images
+    file_format: png
+
+loading_params:
+  state_dir: /users/qchapp/meditron/MIRAGE/tests/output/image_gen/_pipeline_state
+  datasets:
+    - path: /users/qchapp/meditron/MIRAGE/tests/mock_data_image_gen/data.jsonl
+      type: JSONL
+      output_dir: /users/qchapp/meditron/MIRAGE/tests/output/image_gen
+  num_shards: 4  # each Slurm task starts its own server on localhost
+  shard_id: ${SLURM_ARRAY_TASK_ID}
+  batch_size: 64
+
+processing_params:
+  inputs:
+    - name: text
+      key: caption
+
+  outputs:
+    - name: generated_image
+      type: image_gen
+      output_mode: path
+      filename_template: "img_{{ __shard_id }}_{{ __sample_index }}_{{ __source_hash }}"
+      width: 1024
+      height: 1024
+      seed: 42    # shard-aware: effective seed = 42 + shard_id * 1_000_000_000 + sample_index
+      prompt: |
+        A photorealistic image of: {{ text }}
+
+  remove_columns: false
+  output_schema:
+    caption: "{{ text }}"
+    image: "{{ generated_image }}"
+
+execution_params:
+  # Execution mode: "local" or "slurm"
+  # - local: Run directly on this machine
+  # - slurm: Submit jobs to SLURM cluster
+  mode: slurm
+
+  # Whether the canonical `run` command should automatically retry failed shards.
+  # - false: submit one run only
+  # - true: submit, wait, and keep retrying failed shards until success or retry budget exhaustion
+  retry: false
+
+  # Maximum number of times to retry a failed shard (default: 3)
+  max_retries: 3
+
+  # ==========================================================================
+  # SLURM CONFIGURATION (only used when mode: slurm)
+  # ==========================================================================
+
+  # HPC account/partition to charge jobs to (REQUIRED for SLURM mode)
+  account: a127
+
+  # SLURM job name (default: "mmirage-sharded")
+  job_name: mmirage-sharded
+
+  # Optional SLURM reservation name (leave blank or omit to run without a reservation)
+  # reservation: "sai-a127"
+
+  # Number of nodes (default: 1)
+  nodes: 1
+
+  # Number of tasks per node (default: 1)
+  ntasks_per_node: 1
+
+  # Number of GPUs per node (default: 4)
+  gpus: 4
+
+  # Number of CPUs per task (default: 288)
+  cpus_per_task: 288
+
+  # Job time limit in HH:MM:SS format (default: "11:59:59")
+  time_limit: "11:59:59"
+
+  # ==========================================================================
+  # PATH CONFIGURATION
+  # ==========================================================================
+  # These support environment variables ($VAR or ${VAR}) and home directory (~)
+
+  # Project root directory (used as base for relative paths)
+  # If not set, uses current working directory
+  # project_root: "/path/to/project"
+
+  # Directory for SLURM output and error files (default: ~/reports)
+  report_dir: "/users/${USER}/reports"
+
+  # HuggingFace cache directory (default: ~/hf)
+  hf_home: "/capstor/store/cscs/swissai/a127/homes/${USER}/hf"
+
+  # EDF environment file path for cluster-specific setup
+  edf_env: "/users/${USER}/.edf/sglang.toml"
+
+  # ==========================================================================
+  # JOB MONITORING (for "submit" and retry orchestration)
+  # ==========================================================================
+
+  # Seconds to wait between checking job status (default: 30)
+  poll_interval_seconds: 30
+
+  # Seconds to wait after job completes before checking results (default: 60)
+  # This allows filesystem to settle on distributed systems
+  settle_time_seconds: 60
@@ -1,10 +1,11 @@
 processors:
   - type: image_gen
+    backend: diffusers
     pipeline_args:
       model_path: stable-diffusion-v1-5/stable-diffusion-v1-5
       torch_dtype: float16
       device: auto
-      enable_attention_slicing: true
+      enable_attention_slicing: false
     default_sampling_params:
       num_inference_steps: 20
       guidance_scale: 7.5
 
@@ -78,6 +78,13 @@ def batch_process_sample(
         """
         raise NotImplementedError()
 
+    def shutdown(self) -> None:
+        """Release any resources held by this processor.
+
+        Override in subclasses that hold GPU memory, open file handles, or
+        network connections.  The default implementation is a no-op, so
+        processors without such resources do not need to define it.
+
+        The mapper's ``shutdown`` calls this method once on every processor
+        it holds.
+        """
+
 
 class ProcessorRegistry:
     """Registry for managing and accessing available processors.
 
@@ -105,3 +105,8 @@ def rewrite_batch(
             )
 
         return batch_environment
+
+    def shutdown(self) -> None:
+        """Shut down all processors and release their resources.
+
+        Calls ``shutdown()`` on each value of ``self.processors`` in
+        iteration order.  Exceptions are not caught here: if one processor's
+        ``shutdown()`` raises, the error propagates to the caller and the
+        remaining processors are not shut down.
+        """
+        for processor in self.processors.values():
+            processor.shutdown()
@@ -0,0 +1,5 @@
+"""Image generation backends for MMIRAGE."""
+
+from mmirage.core.process.processors.image_gen.backends.base import ImageGenerationBackend
+
+__all__ = ["ImageGenerationBackend"]
@@ -0,0 +1,51 @@
+"""Image generation backend protocol for MMIRAGE."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+try:
+    from typing import Protocol, runtime_checkable
+except ImportError:  # pragma: no cover
+    from typing_extensions import Protocol, runtime_checkable  # type: ignore
+
+
+@runtime_checkable
+class ImageGenerationBackend(Protocol):
+    """Protocol for pluggable image generation backends.
+
+    All backends receive pre-rendered prompts and pre-computed per-sample seeds
+    from the processor.  The processor handles all Jinja template rendering,
+    filename generation, and result bookkeeping; the backend is responsible
+    only for turning prompts + params into PIL images.
+
+    The protocol is structural: a backend does not need to inherit from this
+    class.  Because it is ``@runtime_checkable``, ``isinstance`` checks
+    against it verify only that ``generate_batch`` and ``shutdown`` exist,
+    not that their signatures match.
+    """
+
+    def generate_batch(
+        self,
+        prompts: List[str],
+        negative_prompts: Optional[List[Optional[str]]],
+        params: Dict[str, Any],
+        seeds: List[Optional[int]],
+    ) -> List[Any]:
+        """Generate one image per prompt.
+
+        Args:
+            prompts: Positive prompt strings, one per sample.
+            negative_prompts: Optional list of negative prompts aligned with
+                ``prompts``.  ``None`` means no negative prompts at all;
+                individual ``None`` elements mean no negative prompt for that
+                sample.
+            params: Shared generation kwargs (width, height,
+                num_inference_steps, guidance_scale, …).
+            seeds: Per-sample integer seeds for deterministic generation, or
+                ``None`` elements for unseeded samples.  The list is always
+                the same length as ``prompts``.
+
+        Returns:
+            List of ``PIL.Image.Image`` objects, one per prompt, in the same
+            order.  The return annotation is ``List[Any]``, so the element
+            type is not enforced by the type checker.
+        """
+        ...
+
+    def shutdown(self) -> None:
+        """Release any resources held by the backend."""
+        ...
Original file line number	Diff line number	Diff line change
`@@ -105,3 +105,8 @@ def rewrite_batch(`
`105`	`105`	`)`
`106`	`106`
`107`	`107`	`return batch_environment`
	`108`	`+`
	`109`	`+ def shutdown(self) -> None:`
	`110`	`+ """Shut down all processors and release their resources."""`
	`111`	`+ for processor in self.processors.values():`
	`112`	`+ processor.shutdown()`