EPFLiGHT · fabnemEPFL · Mar 25, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/.gitignore b/.gitignore
@@ -168,3 +168,6 @@ else/
 # Test outputs
 tests/mock_data/output/
 tests/mock_data/shards/
+
+# devcontainer
+.devcontainer/
diff --git a/README.md b/README.md
@@ -32,6 +32,29 @@ For testing and scripts that make use of the library, it is advised to create a
 
 ## Example usage
 
+### Running (single command)
+
+Run the pipeline via the Python CLI. Retry behavior is driven by your YAML config:
+
+- `execution_params.retry: true` → automatically retries failed shards until they succeed or `max_retries` is reached
+- `execution_params.retry: false` → submits/runs once; you can later trigger retries via `check`
+
+```bash
+python -m mmirage.cli run --config configs/config_mock.yaml
+```
+
+To check status and (optionally) submit retries for failed shards:
+
+```bash
+python -m mmirage.cli check --config configs/config_mock.yaml
+```
+
+If you only want the status summary (no retry submission):
+
+```bash
+python -m mmirage.cli check --config configs/config_mock.yaml --summary-only
+```
+
 ### Text-only: Reformatting dataset
 
 Suppose you have a dataset with samples of the following format
@@ -58,11 +81,12 @@ processors:
       max_new_tokens: 384
 
 loading_params:
+  state_dir: /path/to/state/dir
   datasets:
     - path: /path/to/dataset
       type: loadable
       output_dir: /path/to/output/shards
-  num_shards: "$SLURM_ARRAY_TASK_COUNT"
+  num_shards: 4
   shard_id: "$SLURM_ARRAY_TASK_ID"
   batch_size: 64
 
@@ -91,12 +115,17 @@ processing_params:
       - role: assistant
         content: "{{ formatted_answer }}"
     modalities: "{{ modalities }}"
+
+execution_params:
+  mode: local
+  retry: false
 ```
 
 Configuration explanation:
 
 - `processors`: List of processor configurations. Currently supports `llm` type for LLM-based generation.
 - `loading_params`: Parameters for loading and sharding datasets.
+  - `state_dir`: Optional shared directory for shard status/retry state. Defaults to `~/.cache/MMIRAGE/state_dir`.
   - `datasets`: List of dataset configurations with path, type, and output directory.
 - `processing_params`:
   - `inputs`: Variables extracted from the input dataset using JMESPath queries.
@@ -121,11 +150,12 @@ processors:
       max_new_tokens: 768
 
 loading_params:
+  state_dir: /path/to/state/dir
   datasets:
     - path: /path/to/image/dataset
       type: loadable
       output_dir: /path/to/output/shards
-  num_shards: "$SLURM_ARRAY_TASK_COUNT"
+  num_shards: 4
   shard_id: "$SLURM_ARRAY_TASK_ID"
   batch_size: 32
 
@@ -152,6 +182,10 @@ processing_params:
     image: "{{ medical_image }}"
     caption: "{{ enhanced_caption }}"
     original_caption: "{{ original_caption }}"
+
+execution_params:
+  mode: local
+  retry: false
 ```
 
 Key multimodal features:

diff --git a/configs/config_comprehensive.yaml b/configs/config_comprehensive.yaml
@@ -0,0 +1,200 @@
+# MMIRAGE Configuration with all parameters
+#
+# This is a comprehensive example showing all available configuration options.
+# You can copy and modify this file for your specific use case.
+#
+# Parameters are organized into sections:
+# 1. processors - LLM and other data transformation processors
+# 2. loading_params - Dataset loading and sharding configuration
+# 3. processing_params - How to transform/process the data
+# 4. execution_params - SLURM, retry, and execution settings
+#
+
+# ============================================================================
+# PROCESSORS CONFIGURATION
+# ============================================================================
+# Define the processors used to transform your data.
+# Common types: llm, vision_llm, etc.
+
+processors:
+  - type: llm
+    server_args:
+      model_path: Qwen/Qwen3-4B-Instruct-2507
+      tp_size: 1
+      disable_custom_all_reduce: true
+    default_sampling_params:
+      temperature: 0.1
+      top_p: 0.9
+      max_new_tokens: 1024
+      custom_params:
+        chat_template_kwargs:
+          enable_thinking: false
+
+
+# ============================================================================
+# LOADING PARAMETERS
+# ============================================================================
+# Configure how datasets are loaded, sharded, and processed.
+
+loading_params:
+  # Directory to store pipeline state (checkpoints, status, retry tracking)
+  # Supports environment variables: $VAR or ${VAR}
+  state_dir: tests/output/data/_pipeline_state
+
+  # Dataset configurations to load
+  # Each dataset can be separately sharded and output
+  datasets:
+    - path: tests/mock_data/data.jsonl
+      type: JSONL
+      output_dir: tests/output/data
+      # image_base_path: /path/to/images  # Optional, for vision tasks
+
+  # Total number of shards to split datasets into.
+  # For SLURM, this determines the array job size.
+  num_shards: 4
+
+  # Shard ID for this process (0-indexed).
+  # In SLURM array jobs, this is set automatically.
+  shard_id: "$SLURM_ARRAY_TASK_ID"
+
+  # Batch size for processing samples
+  batch_size: 64
+
+
+# ============================================================================
+# PROCESSING PARAMETERS
+# ============================================================================
+# Define what to extract, transform, and output from each sample.
+
+processing_params:
+  # Input variables to extract from source data
+  inputs:
+    - name: text
+      key: text
+    # For vision examples:
+    # - name: image
+    #   key: image_path
+    #   type: image
+
+  # Output variables generated by processors
+  outputs:
+    - name: formatted_answer
+      type: llm
+      output_type: JSON
+      output_schema:
+        - question
+        - answer
+      prompt: |
+        Generate one question and its corresponding answer using the following text:
+        ```
+        {{ text }}
+        ```
+
+  # Whether to remove original columns from the dataset
+  remove_columns: true
+
+  # Output schema: how to structure the final dataset
+  output_schema:
+    conversations:
+      - role: "user"
+        content: "{{ formatted_answer.question }}"
+      - role: "assistant"
+        content: "{{ formatted_answer.answer }}"
+
+
+# ============================================================================
+# EXECUTION PARAMETERS
+# ============================================================================
+# Configure how to execute the pipeline: locally or on SLURM cluster.
+# Parameters here are optional with sensible defaults, except `account` (required in SLURM mode).
+
+execution_params:
+  # Execution mode: "local" or "slurm"
+  # - local: Run directly on this machine
+  # - slurm: Submit jobs to SLURM cluster
+  mode: slurm
+
+  # Whether the canonical `run` command should automatically retry failed shards.
+  # - false: submit one run only
+  # - true: submit, wait, and keep retrying failed shards until success or retry budget exhaustion
+  retry: true
+
+  # Maximum number of times to retry a failed shard (default: 3)
+  max_retries: 3
+
+  # ==========================================================================
+  # SLURM CONFIGURATION (only used when mode: slurm)
+  # ==========================================================================
+
+  # HPC account to charge jobs to (REQUIRED for SLURM mode)
+  account: a127
+
+  # SLURM job name (default: "mmirage-sharded")
+  job_name: mmirage-sharded
+
+  # Optional SLURM reservation name (omit or leave blank to run without a reservation)
+  # reservation: "sai-a127"
+
+  # Number of nodes (default: 1)
+  nodes: 1
+
+  # Number of tasks per node (default: 1)
+  ntasks_per_node: 1
+
+  # Number of GPUs per node (default: 4)
+  gpus: 4
+
+  # Number of CPUs per task (default: 288)
+  cpus_per_task: 288
+
+  # Job time limit in HH:MM:SS format (default: "11:59:59")
+  time_limit: "11:59:59"
+
+  # ==========================================================================
+  # PATH CONFIGURATION
+  # ==========================================================================
+  # These support environment variables ($VAR or ${VAR}) and home directory (~)
+
+  # Project root directory (used as base for relative paths)
+  # If not set, uses current working directory
+  # project_root: "/path/to/project"
+
+  # Directory for SLURM output and error files (default: ~/reports)
+  report_dir: "/users/${USER}/reports"
+
+  # HuggingFace cache directory (default: ~/hf)
+  hf_home: "/capstor/store/cscs/swissai/a127/homes/${USER}/hf"
+
+  # EDF environment file path for cluster-specific setup
+  edf_env: "/users/${USER}/.edf/mmirage.toml"
+
+  # ==========================================================================
+  # JOB MONITORING (for "submit" and retry orchestration)
+  # ==========================================================================
+
+  # Seconds to wait between checking job status (default: 30)
+  poll_interval_seconds: 30
+
+  # Seconds to wait after job completes before checking results (default: 60)
+  # This allows the filesystem to settle on distributed systems
+  settle_time_seconds: 60
+
+
+# ============================================================================
+# USAGE EXAMPLES
+# ============================================================================
+#
+# 1. Canonical entrypoint (local or SLURM; retry controlled by config):
+#    python -m mmirage.cli run --config config.yaml
+#
+# 2. Submit a job to SLURM and wait for completion:
+#    python -m mmirage.cli submit --config config.yaml --wait
+#
+# 3. Submit a job and capture its job ID (for scripting):
+#    JOB_ID=$(python -m mmirage.cli submit --config config.yaml)
+#
+# 4. Run a single shard locally:
+#    python -m mmirage.cli run --config config.yaml --shard-id 0
+#
+# 5. Check status of all shards (and optionally submit retries):
+#    python -m mmirage.cli check --config config.yaml
diff --git a/configs/config_mock.yaml b/configs/config_mock.yaml
@@ -13,12 +13,13 @@ processors:
           enable_thinking: false
 
 loading_params:
+  state_dir: tests/output/data/_pipeline_state
   datasets:
     - path: tests/mock_data/data.jsonl
       type: JSONL
       output_dir: tests/output/data
 
-  num_shards: 4
+  num_shards: 1
   shard_id: 0
   batch_size: 64
 
@@ -47,3 +48,12 @@ processing_params:
         content: "{{ formatted_answer.question }}"
       - role: "assistant"
         content: "{{ formatted_answer.answer }}"
+
+# Execution configuration (local or SLURM cluster)
+# For local testing, use mode: local
+# For SLURM cluster, use mode: slurm and specify account
+execution_params:
+  mode: local
+  retry: false
+  report_dir: ~/reports
+  hf_home: ~/hf
diff --git a/configs/config_mock_vision.yaml b/configs/config_mock_vision.yaml
@@ -11,13 +11,14 @@ processors:
       max_new_tokens: 512
 
 loading_params:
+  state_dir: tests/output/data_vision/_pipeline_state
   datasets:
     - path: tests/mock_data_vision/data.jsonl
       type: JSONL
       output_dir: tests/output/data_vision
       image_base_path: tests/mock_data_vision  # Base directory where images are stored
 
-  num_shards: 4
+  num_shards: 1
   shard_id: 0
   batch_size: 1
 
@@ -38,3 +39,10 @@ processing_params:
   output_schema:
     image: "{{ image_input }}"
     caption: "{{ caption }}"
+
+# Execution configuration (local or SLURM cluster)
+execution_params:
+  mode: local
+  retry: false
+  report_dir: ~/reports
+  hf_home: ~/hf
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,6 +49,9 @@ dev = [
   "pytest",
 ]
 
+[project.scripts]
+mmirage = "mmirage.cli:main"
+
 [tool.hatch.build.targets.wheel]
 packages = ["src/mmirage"]