# Source: MMIRAGE/configs/config_benchmark_datatrove.yaml
#         (EPFLiGHT/MMIRAGE @ 1f0b9e0e0b8c172dce40dac99367138076133866)

# MMIRAGE — DataTrove-compatible throughput benchmark
#
# Mirrors the conditions used in the DataTrove inference benchmark
# (https://github.com/huggingface/datatrove/tree/main/examples/inference/benchmark):
#
#   dataset : simplescaling/s1K-1.1  (train split, 1 000 samples)
#   prompt  : raw `question` field, no system prompt
#   output  : up to 1 024 tokens per sample
#   context : 2 048-token model max context
#   model   : Qwen/Qwen3-4B  (DataTrove baseline: tp=1 on a single GPU)
#
# Download the dataset before running:
#
#   python -c "
#   from datasets import load_dataset
#   ds = load_dataset('simplescaling/s1K-1.1', split='train')
#   ds.save_to_disk('data/s1K-1.1')
#   "
#
# Then run with stats collection enabled:
#
#   mmirage run --config configs/config_benchmark_datatrove.yaml --stats
#
# Inspect results:
#
#   mmirage stats --config configs/config_benchmark_datatrove.yaml

processors:
  # A single LLM processor; the engine-tuning comment below refers to SGLang.
  - type: llm
    server_args:
      # Same model family as the DataTrove baseline.
      model_path: "Qwen/Qwen3-4B"
      # DataTrove baseline: tp=1.
      tp_size: 1
      trust_remote_code: true
      disable_custom_all_reduce: true
      # SGLang engine tuning — equivalents of DataTrove's vLLM mns/mnbt knobs.
      # Only max_running_requests is set here.
      # NOTE(review): the file header lists a 2 048-token model max context,
      # but no context-length setting appears anywhere in this file — confirm
      # whether it is enforced elsewhere or should be added here.
      extra_engine_args:
        max_running_requests: 1000
    default_sampling_params:
      # Presumably greedy decoding (temperature 0) — TODO confirm.
      temperature: 0.0
      # DataTrove: max-tokens=1024.
      max_new_tokens: 1024

loading_params:
  # Directory for pipeline state (exact contents not visible from this file).
  state_dir: "data/benchmark_s1k/_pipeline_state"
  datasets:
    # `path` is the save_to_disk() target from the download snippet in the
    # file header.
    - type: loadable
      path: "data/s1K-1.1"
      output_dir: "data/benchmark_s1k/output"
  # Single shard; batch_size equals the 1 000 samples the header states for
  # the train split.
  num_shards: 1
  batch_size: 1000
  # NOTE(review): presumably expanded from the Slurm array-task environment
  # variable at load time — confirm it resolves to a valid shard index when
  # num_shards is 1.
  shard_id: '$SLURM_ARRAY_TASK_ID'

processing_params:
  inputs:
    # DataTrove: prompt-column=question.
    - name: question
      key: question

  outputs:
    - name: answer
      type: llm
      output_type: plain
      # Qwen3 thinking is disabled by embedding an empty <think> block in the
      # prompt, which avoids any dependency on SGLang sampling-param support
      # for an enable_thinking flag.
      #
      # The string reproduces what the Qwen3 chat template renders for a single
      # user turn with enable_thinking=False and add_generation_prompt=True:
      #   - the user content is followed directly by <|im_end|> (no newline
      #     in between);
      #   - the empty think block is terminated by a blank line
      #     ("</think>\n\n").
      # Keep the escapes exactly as written; a double-quoted scalar is used so
      # that every "\n" is an explicit, visible newline.
      prompt: "<|im_start|>user\n{{ question }}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"

  # Keep the original dataset columns alongside the generated ones
  # (as the key name suggests — TODO confirm against the loader).
  remove_columns: false
  # Columns written to the output, rendered from the variables defined above.
  output_schema:
    question: "{{ question }}"
    answer: "{{ answer }}"

execution_params:
  # --- Run control ---------------------------------------------------------
  mode: slurm
  # NOTE(review): max_retries is set while retry is false — confirm whether
  # max_retries has any effect in this configuration.
  retry: false
  max_retries: 3
  merge: false

  # --- Slurm job request ---------------------------------------------------
  job_name: mmirage-sharded
  account: "a127"
  nodes: 1
  ntasks_per_node: 1
  # NOTE(review): 4 GPUs are requested while the processor runs with
  # tp_size: 1 — presumably a whole node is allocated; confirm that only one
  # GPU serves the model so the comparison with DataTrove's single-GPU
  # baseline is like-for-like.
  gpus: 4
  cpus_per_task: 288
  # Quoted so the value stays a string (unquoted digit:digit values can be
  # parsed as sexagesimal numbers by YAML 1.1 loaders).
  time_limit: '11:59:59'

  # --- Paths ---------------------------------------------------------------
  # ${USER} is presumably expanded from the environment by the launcher —
  # TODO confirm.
  report_dir: '/users/${USER}/reports'
  hf_home: '/capstor/store/cscs/swissai/a127/homes/${USER}/hf'
  edf_env: '/users/${USER}/.edf/mmirage.toml'

  # --- Job monitoring ------------------------------------------------------
  poll_interval_seconds: 30
  settle_time_seconds: 60