EPFLiGHT
diff --git a/‎.github/workflows/docker.yml‎
Lines changed: 81 additions & 0 deletions b/‎.github/workflows/docker.yml‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 1 deletion b/‎.gitignore‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 16 additions & 1 deletion b/‎README.md‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎configs/config_medtrinity.yaml‎
Lines changed: 0 additions & 51 deletions b/‎configs/config_medtrinity.yaml‎
Lines changed: 0 additions & 51 deletions
diff --git a/‎configs/config_mock.yaml‎
Lines changed: 55 additions & 0 deletions b/‎configs/config_mock.yaml‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎configs/config_small.yaml‎
Lines changed: 0 additions & 46 deletions b/‎configs/config_small.yaml‎
Lines changed: 0 additions & 46 deletions
diff --git a/‎docker/Dockerfile‎
Lines changed: 5 additions & 0 deletions b/‎docker/Dockerfile‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 2 deletions b/‎pyproject.toml‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎run.sh‎
Lines changed: 18 additions & 8 deletions b/‎run.sh‎
Lines changed: 18 additions & 8 deletions
@@ -0,0 +1,93 @@
+# Builds the MIRAGE Docker image natively on amd64 and arm64 runners.
+# Images are pushed to Docker Hub only for pushes to main; pull requests
+# run the build as a check without logging in or publishing anything.
+name: Build and Push Docker Image
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+# NOTE(review): no step below references IMAGE_NAME or REGISTRY; the pushed
+# tags are built from the DOCKER_USERNAME secret and matrix.name instead.
+env:
+  IMAGE_NAME: michelducartier24/mirage
+  REGISTRY: docker.io
+
+jobs:
+  build-docker:
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - platform: ubuntu-latest
+            path: docker/Dockerfile
+            tag_base: amd64
+            name: mirage-git
+          - platform: ubuntu-24.04-arm
+            path: docker/Dockerfile
+            tag_base: arm64
+            name: mirage-git
+
+    runs-on: ${{ matrix.platform }}
+    environment: docker
+
+    steps:
+      - name: Free space (ARM)
+        if: matrix.platform == 'ubuntu-24.04-arm'
+        run: |
+          df -h
+          du -h -d1 /home/runner || true
+
+          rm -rf /opt/hostedtoolcache
+          rm -rf /home/runner/.cache
+          rm -rf /home/runner/.docker
+          rm -rf /home/runner/actions-runner/_work/_tool
+
+          df -h
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          docker-images: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
+
+      - name: Check free space
+        run: df -h
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Secrets are not exposed to pull requests from forks, and a PR build
+      # must not publish, so the registry login only runs for pushes.
+      - name: Log in to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ${{ matrix.path }}
+          # Build on every event, but only publish for pushes to main so an
+          # unmerged PR cannot overwrite the latest-* tags.
+          push: ${{ github.event_name != 'pull_request' }}
+          # The "|| 'local'" fallback keeps the tag well-formed when the
+          # secret is unavailable (fork PRs); those images are never pushed.
+          tags: |
+            ${{ secrets.DOCKER_USERNAME || 'local' }}/${{ matrix.name }}:latest-${{ matrix.tag_base }}
+            ${{ secrets.DOCKER_USERNAME || 'local' }}/${{ matrix.name }}:${{ github.sha }}-${{ matrix.tag_base }}
@@ -6,6 +6,9 @@ __pycache__/
 # C extensions
 *.so
 
+tests/output/**
+tests/merged/**
+
 # Distribution / packaging
 .Python
 build/
@@ -160,4 +163,4 @@ cython_debug/
 #.idea/
 
 logs/
-else/
+else/
@@ -2,6 +2,21 @@
 
 MIRAGE, which stands for Multimodal Intelligent Reformatting and Augmentation Generation Engine, is an advanced platform designed to streamline the processing of datasets using generative models. It is engineered to handle large-scale data reformatting and augmentation tasks with efficiency and precision. By leveraging state-of-the-art generative models, MIRAGE enables users to perform complex dataset transformations, ensuring compatibility across various formats and schemas. Its multi-node support and parallel processing capabilities make it an ideal choice for scenarios demanding substantial computational power, such as distributed training and inference workflows. MIRAGE not only simplifies the integration of powerful language models but also provides a customizable framework for diverse use cases, from reformatting conversational datasets to generating Q/A pairs from plain text.
 
+## How to install
+
+To install the library, you can clone it from GitHub and then use pip to install it directly. It is recommended to have already installed `torch` and `sglang` to take advantage of GPU acceleration.
+
+```bash
+git clone git@github.com:EPFLiGHT/MIRAGE.git
+pip install -e ./MIRAGE
+```
+
+For testing and scripts that make use of the library, it is advised to create a .env file. You can do this by running the following command:
+```bash
+curl https://raw.githubusercontent.com/EPFLiGHT/MIRAGE/refs/heads/json-output/scripts/generate_env.sh | sh
+```
+
+
 ## Key features
 
 - Easily configurable with a YAML file which configure the following parameters
@@ -114,4 +129,4 @@ Here, we choose to output a JSON answer with 3 keys ("question", "explanation" a
 - Jinja2 to process the YAML: #[link](https://jinja.palletsprojects.com/en/stable/)
 - JMESPath: #[link](https://jmespath.org/)
 - SGLang: #[link](https://github.com/sgl-project/sglang)
-- Paper for performance drom: #[link](https://arxiv.org/abs/2408.02442)
+- Paper for performance drop: #[link](https://arxiv.org/abs/2408.02442)
@@ -0,0 +1,61 @@
+# Mock configuration: one LLM processor that turns each input text into a
+# question/answer pair and emits it as a two-turn conversation.
+processors:
+  - type: llm
+    server_args:
+      model_path: Qwen/Qwen3-4B-Instruct-2507
+      tp_size: 1
+      disable_custom_all_reduce: true
+    sampling_params:
+      temperature: 0.1
+      top_p: 0.9
+      max_new_tokens: 1024
+      custom_params:
+        chat_template_kwargs:
+          enable_thinking: false
+
+loading_params:
+  datasets:
+    # Dataset read from a single file.
+    - path: tests/mock_data/data.jsonl
+      type: JSONL
+      output_dir: tests/output/data
+    # Dataset whose path maps names (train, test) to separate files.
+    - path:
+        train: tests/mock_data/data2/train.jsonl
+        test: tests/mock_data/data2/test.jsonl
+      type: JSONL
+      output_dir: tests/output/data2
+
+  num_shards: 4
+  shard_id: 0
+  conversations_field: conversations
+  batch_size: 64
+
+processing_params:
+  inputs:
+    - name: text
+      key: text
+
+  outputs:
+    - name: formatted_answer
+      type: llm
+      output_type: JSON
+      output_schema:
+        - question
+        - answer
+      prompt: |
+        Generate one question and its corresponding answer using the following text:
+        ```
+        {{ text }}
+        ```
+
+  remove_columns: true
+  # Template for the emitted record; the placeholders read the two keys
+  # declared in the formatted_answer output_schema above.
+  output_schema:
+    conversations:
+      - role: user
+        content: "{{ formatted_answer.question }}"
+      - role: assistant
+        content: "{{ formatted_answer.answer }}"
@@ -0,0 +1,8 @@
+# MIRAGE image layered on top of the lmsysorg/sglang base image.
+FROM docker.io/lmsysorg/sglang:latest
+
+# WORKDIR creates the directory if needed; the build context is then
+# copied into it and installed in editable mode from there.
+WORKDIR /workspace/MIRAGE
+COPY . .
+RUN pip install --no-cache-dir -e .
@@ -11,7 +11,7 @@ authors = [{ name = "Meditron team" }]
 
 # Core runtime deps for your scripts
 dependencies = [
-  "sglang[all]>=0.5.2",
+  "sglang[diffusion]>=0.5.2",
   "transformers>=4.46.0",
   "pyzmq",
   "uvloop<0.22; platform_system != 'Windows'",
@@ -34,6 +34,7 @@ dependencies = [
   "fsspec",
   "dacite>=1.6.0",
   "pydantic>=2.12",
+  "jmespath"
 ]
 
 [project.optional-dependencies]
@@ -49,4 +50,4 @@ dev = [
 packages = ["src/mirage"]
 
 [tool.hatch.build.targets.sdist]
-include = ["src/mirage/**", "pyproject.toml", "README.md"]
+include = ["src/mirage/**", "pyproject.toml", "README.md"]
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=med-sharded
+#SBATCH --job-name=mirage-example
 #SBATCH --chdir=/users/$USER/meditron/MIRAGE/src/mirage
 #SBATCH --output=/users/$USER/reports/R-%x.%A_%a.out
 #SBATCH --error=/users/$USER/reports/R-%x.%A_%a.err
@@ -8,21 +8,34 @@
 #SBATCH --gres=gpu:4
 #SBATCH --cpus-per-task=288
 #SBATCH --time=11:59:59
-#SBATCH --environment=/users/$USER/.edf/sglang.toml
 #SBATCH -A a127
-#SBATCH --array=0-31
+#SBATCH --array=0-3
 
 # --- outputs & config ---
-export ROOT=/capstor/store/cscs/swissai/a127/homes/$USER/datasets/english_small
+export ROOT=$SCRATCH/mirage_example
 export SHARDS_ROOT="$ROOT/shards"
 export MERGED_DIR="$ROOT/merged"
-export CFG=/users/$USER/MIRAGE/configs/config_small.yaml
+# config_mock.yaml sets num_shards: 4, matching --array=0-3 above.
+export CFG=/users/$USER/meditron/MIRAGE/configs/config_mock.yaml
 
 # HF cache/home
-export HF_HOME=/capstor/store/cscs/swissai/a127/homes/$USER/hf
+export HF_HOME=$SCRATCH/hf
 
 mkdir -p "$SHARDS_ROOT"
 mkdir -p "$MERGED_DIR"
 
-python /users/$USER/MIRAGE/src/mirage/shard_process.py \
-  --config "$CFG"
+# Command executed by each array task.
+export CMD="python /users/$USER/meditron/MIRAGE/src/mirage/shard_process.py --config $CFG"
+
+# Options passed to srun for the job step.
+SRUN_ARGS=" \
+  --cpus-per-task $SLURM_CPUS_PER_TASK \
+  --jobid $SLURM_JOB_ID \
+  --wait 60 \
+  -A a127 \
+  --reservation sai-a127 \
+  --environment /users/$USER/.edf/mirage.toml
+  "
+# bash -c is needed for the delayed interpolation of env vars to work
+srun $SRUN_ARGS bash -c "$CMD"
+echo "END TIME: $(date)"