diff --git a/README.md b/README.md index 441defb60..8dae4478e 100644 --- a/README.md +++ b/README.md @@ -315,3 +315,10 @@ All PMR CLI commands support the following options: | `--script` | `-s` | None | Path to script file or directory to execute | Note that you can provide either `--command` or `--script`, but not both. When using `--script` with a directory path, all executable files in that directory will be distributed across the instances. + + +# Midtraining utilities +I (mj) built out some utilities to minimize manual labor for common tasks for midtraining. +- [`scripts/gs2weka.py`](scripts/gs2weka.py): This script finds the latest checkpoint for a given model configuration and copies it from Google Cloud Storage to Weka storage using olmo-cookbook. Run with `python scripts/gs2weka.py <config.yaml>` to automatically detect your Beaker account and process the latest checkpoint, or use --beaker-name to specify a different account name. +- [`scripts/convert_from_config.py`](scripts/convert_from_config.py): This script finds the latest checkpoint in Weka storage for a given model configuration and converts it to HuggingFace format using olmo-cookbook-eval. Run with `python scripts/convert_from_config.py <config.yaml>` to automatically detect your Beaker account and convert the latest checkpoint, with optional --overwrite flag to reconvert existing checkpoints. +- [`scripts/olmo3_midtrain_eval.sh`](scripts/olmo3_midtrain_eval.sh): This script runs OLMo3 midtraining evaluations on a given checkpoint path using two different task suites (midtrain and main). Run with `bash scripts/olmo3_midtrain_eval.sh <checkpoint_path>` where the checkpoint path should point to a converted HuggingFace format checkpoint (e.g., ending in -hf). 
diff --git a/pyproject.toml b/pyproject.toml index 3f20e6ebc..5e2bf491b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ checkpoints = [ "boto3" ] all = [ - "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@c779ca546cc3194e73e7491aaefcdffbed042c65", + "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@tylerr/olmo3-scripts-swafix-foreachopt", "beaker-py>=1,<2", "GitPython>=3.0,<4.0", "wandb", diff --git a/scripts/convert_from_config.py b/scripts/convert_from_config.py new file mode 100755 index 000000000..061649591 --- /dev/null +++ b/scripts/convert_from_config.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +""" +Script to process YAML file and run olmo-cookbook command with latest checkpoint +""" +import argparse +import re +import subprocess +import sys +from pathlib import Path + +import yaml + + +def run_command(cmd, shell=False, errs_okay=False): + """Run a shell command and return stdout""" + try: + if shell: + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) + else: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return result.stdout.strip() + except subprocess.CalledProcessError as e: + print(f"Error running command: {' '.join(cmd) if isinstance(cmd, list) else cmd}") + print(f"Error: {e.stderr}") + if not errs_okay: + sys.exit(1) + raise e + + +def get_yaml_name(yaml_file): + """Extract the 'name' attribute from YAML file""" + try: + with open(yaml_file, "r") as f: + data = yaml.safe_load(f) + + if "name" not in data: + print(f"Error: 'name' attribute not found in {yaml_file}") + sys.exit(1) + + return data["name"] + except Exception as e: + print(f"Error reading YAML file {yaml_file}: {e}") + sys.exit(1) + + +def get_beaker_name(): + """Get the NAME from 'beaker account whoami' output""" + output = run_command(["beaker", "account", "whoami"]) + + # Parse the table output to extract NAME + lines = output.strip().split("\n") + if len(lines) < 2: + 
print("Error: Unexpected output from 'beaker account whoami'") + sys.exit(1) + + # Look for the data row (skip header) + for line in lines[1:]: + parts = line.split() + if len(parts) >= 2: + return parts[1] # NAME is the second column + + print("Error: Could not extract NAME from beaker account whoami output") + sys.exit(1) + + +def find_latest_checkpoint(beaker_name, yaml_name): + """Find the latest checkpoint directory in weka""" + + weka_path = f"weka://oe-training-default/ai2-llm/checkpoints/{beaker_name}/{yaml_name}-*" + + # Convert weka:// path to s3:// path for s5cmd + s3_path = weka_path.replace("weka://oe-training-default/", "s3://oe-training-default/") + + # Add wildcard to check for any files in the directory + s3_path_wildcard = f"{s3_path}/*" + + print(f"Checking if weka path exists: {weka_path}") + print(f"Using s5cmd to check: {s3_path_wildcard}") + + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + s3_path_wildcard, + ] + + try: + output = run_command(cmd, errs_okay=True) + if not output: + print(f"No checkpoints found with prefix: {prefix}") + sys.exit(1) + + # Get all matching paths + paths = output.strip().split("\n") + + # Sort paths to get the latest one (lexicographically) + paths = [_.split(" ")[-1].strip() for _ in paths] + ckpts = set() + for p in paths: + re_string = yaml_name + r"-[0-9a-f]{8}/step\d+/" + if re.match(re_string, p): + ckpts.add(re.match(re_string, p).group()) + assert ( + len(ckpts) > 0 + ), "No valid checkpoints found??? 
[this should assert should never fail if we got here to begin with]" + max_ckpt = max(ckpts) + print(max_ckpt) + return "weka://oe-training-default/ai2-llm/checkpoints/%s/%s" % (beaker_name, max_ckpt) + + except subprocess.CalledProcessError as e: + print("No weka paths found!") + print( + f"Make sure you have access to weka://oe-training-deafult/ai2-llm/checkpoints/{beaker_name}/{yaml_name}-* directories" + ) + raise e + # sys.exit(1) + except Exception as e: + print("ERR CODE ", e) + raise e + + +def check_hf_path_exists(latest_ckpt): + """Check if the corresponding weka path already exists""" + # Convert gs:// path to weka:// path + hf_path = latest_ckpt.rstrip("/") + "-hf/*" + + print(f"Checking if weka path exists: {hf_path}") + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + hf_path, + ] + + # Convert weka:// path to s3:// path for s5cmd + hf_path = hf_path.replace("weka://oe-training-default/", "s3://oe-training-default/") + + print(f"Checking if weka path exists: {hf_path}") + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + hf_path, + ] + + try: + # Run the command - if it succeeds, the path exists + output = run_command(cmd, errs_okay=True) + print(f"āœ… Weka path exists - found %s files:" % len(output.split("\n"))) + return True + except subprocess.CalledProcessError as e: + # If the command fails, the path doesn't exist + print(f"āŒ Weka path does not exist (s5cmd failed as expected)") + return False + + +def run_olmo_cookbook(weka_path): + """Run the olmo-cookbook command with the GCS path""" + print("Converting %s" % weka_path) + weka_path = weka_path.replace("weka://", "/").rstrip("/") + cmd = [ + "olmo-cookbook-eval", + "convert", + weka_path, + "-t", + "olmo-core-v2", + "--use-beaker", + "--huggingface-transformers-git-url", + "https://github.com/2015aroras/transformers.git", + "--huggingface-transformers-commit-hash", + 
"ae3889ced6ed7362e5883671fc6dc4cb4fece5fa", + "--olmo-core-v2-commit-hash", + "57a04d0b69047d797c96eede056a211e75b5914a", + ] + print(f"Running: {' '.join(cmd)}") + + try: + # Run the command and stream output in real-time + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True + ) + + beaker_url = None + beaker_url_pattern = re.compile(r"https://beaker\.org/ex/[A-Z0-9]+") + + for line in process.stdout: + print(line, end="") + + # Look for the beaker URL in the output + match = beaker_url_pattern.search(line) + if match: + beaker_url = match.group(0) + + process.wait() + + if process.returncode != 0: + print(f"Error: olmo-cookbook command failed with return code {process.returncode}") + sys.exit(1) + + # Print the extracted Beaker URL + if beaker_url: + print(f"\n" + "=" * 60) + print(f"šŸ”— Beaker Experiment URL: {beaker_url}") + print(f"=" * 60) + return beaker_url + else: + print("\nWarning: Could not extract Beaker experiment URL from output") + + except Exception as e: + print(f"Error running olmo-cookbook: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser(description="Process YAML file and run olmo-cookbook with latest checkpoint") + parser.add_argument("yaml_file", help="Path to the YAML file") + parser.add_argument("--beaker-name", required=False, default=None) + parser.add_argument("--overwrite", required=False, type=bool, default=False) + args = parser.parse_args() + + # Validate input file exists + if not Path(args.yaml_file).exists(): + print(f"Error: YAML file {args.yaml_file} does not exist") + sys.exit(1) + + print(f"Processing YAML file: {args.yaml_file}") + + # Step 1: Get name from YAML + yaml_name = get_yaml_name(args.yaml_file) + print(f"YAML name: {yaml_name}") + + # Step 2: Get beaker name + if args.beaker_name == None: + beaker_name = get_beaker_name() + else: + beaker_name = args.beaker_name + print(f"Beaker name: {beaker_name}") + + # Step 3: 
Find latest checkpoint + print( + f"Searching for checkpoints with prefix: weka://oe-training-default/ai2-llm/checkpoints/{beaker_name}/{yaml_name}-" + ) + latest_checkpoint = find_latest_checkpoint(beaker_name, yaml_name) + print(f"Latest checkpoint: {latest_checkpoint}") + + # Step 4: Check if weka path already exists + if check_hf_path_exists(latest_checkpoint) and not args.overwrite: + print(f"\n🚫 Converted checkpoint already exists in weka storage. Skipping cookbook command.") + print(f"The checkpoint has already been copied to weka://oe-training-default/") + return + + # Step 5: Run olmo-cookbook command + run_olmo_cookbook(latest_checkpoint) + + +if __name__ == "__main__": + main() diff --git a/scripts/gs2weka.py b/scripts/gs2weka.py new file mode 100644 index 000000000..91cd23067 --- /dev/null +++ b/scripts/gs2weka.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +Script to process YAML file and run olmo-cookbook command with latest checkpoint +""" +import argparse +import re +import subprocess +import sys +from pathlib import Path + +import yaml +from google.auth.exceptions import DefaultCredentialsError +from google.cloud import storage + + +def run_command(cmd, shell=False, errs_okay=False): + """Run a shell command and return stdout""" + try: + if shell: + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) + else: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return result.stdout.strip() + except subprocess.CalledProcessError as e: + print(f"Error running command: {' '.join(cmd) if isinstance(cmd, list) else cmd}") + print(f"Error: {e.stderr}") + if not errs_okay: + sys.exit(1) + raise e + + +def get_yaml_name(yaml_file): + """Extract the 'name' attribute from YAML file""" + try: + with open(yaml_file, "r") as f: + data = yaml.safe_load(f) + + if "name" not in data: + print(f"Error: 'name' attribute not found in {yaml_file}") + sys.exit(1) + + return data["name"] + except Exception as e: 
+ print(f"Error reading YAML file {yaml_file}: {e}") + sys.exit(1) + + +def get_beaker_name(): + """Get the NAME from 'beaker account whoami' output""" + output = run_command(["beaker", "account", "whoami"]) + + # Parse the table output to extract NAME + lines = output.strip().split("\n") + if len(lines) < 2: + print("Error: Unexpected output from 'beaker account whoami'") + sys.exit(1) + + # Look for the data row (skip header) + for line in lines[1:]: + parts = line.split() + if len(parts) >= 2: + return parts[1] # NAME is the second column + + print("Error: Could not extract NAME from beaker account whoami output") + sys.exit(1) + + +def find_latest_checkpoint(beaker_name, yaml_name): + """Find the latest checkpoint directory in GCS""" + # Construct the GCS path prefix (without gs://) + bucket_name = "ai2-llm" + prefix = f"checkpoints/{beaker_name}/{yaml_name}-" + + try: + # Initialize the GCS client + client = storage.Client(project="ai2-allennlp") + bucket = client.bucket(bucket_name) + + print(f"Searching for checkpoints with prefix: gs://{bucket_name}/{prefix}") + + # List all blobs with the prefix + blobs = bucket.list_blobs(prefix=prefix) + + # Find paths that match the pattern: prefix*/step*/ + checkpoint_paths = [] + for blob in blobs: + # Split the blob name into parts + parts = blob.name.split("/") + + # Check if this looks like a checkpoint directory structure + # We want: checkpoints/{beaker_name}/{yaml_name}-{something}/step{something}/ + if len(parts) >= 4: # At least checkpoints/beaker/yaml-*/step*/ + # Check if there's a step directory in the path + for i, part in enumerate(parts): + if part.startswith("step") and i < len(parts) - 1: + # Construct the directory path up to and including the step directory + step_dir_path = "/".join(parts[: i + 1]) + full_path = f"gs://{bucket_name}/{step_dir_path}" + + if full_path not in checkpoint_paths: + checkpoint_paths.append(full_path) + break + + if not checkpoint_paths: + print(f"No checkpoints found with 
prefix: gs://{bucket_name}/{prefix}") + sys.exit(1) + + # Sort paths to get the latest one (lexicographically) + checkpoint_paths.sort(reverse=True) + + print(f"Found {len(checkpoint_paths)} checkpoint directories") + print(f"Latest checkpoint: {checkpoint_paths[0]}") + + return checkpoint_paths[0] + + except DefaultCredentialsError: + print("Error: Google Cloud credentials not found.") + print("Please set up authentication:") + print("1. Set GOOGLE_APPLICATION_CREDENTIALS environment variable:") + print(" export GOOGLE_APPLICATION_CREDENTIALS='/path/to/service-account-key.json'") + print("2. Or run: gcloud auth application-default login") + sys.exit(1) + + except Exception as e: + print(f"Error listing GCS directories: {e}") + print(f"Make sure you have access to gs://{bucket_name}/{prefix}* directories") + sys.exit(1) + + +# Alternative implementation that's more efficient for large buckets +def find_latest_checkpoint_optimized(beaker_name, yaml_name): + """ + More efficient version that uses prefix listing to avoid scanning all objects. + This works better when you have many files in the bucket. 
+ """ + bucket_name = "ai2-llm" + prefix = f"checkpoints/{beaker_name}/{yaml_name}-" + + try: + client = storage.Client() + bucket = client.bucket(bucket_name) + + # Get all "directories" by using delimiter + # This is more efficient as it doesn't list individual files + blobs = bucket.list_blobs(prefix=prefix, delimiter="/") + + # Collect all run directories (yaml_name-*) + run_prefixes = [] + for page in blobs.pages: + run_prefixes.extend(page.prefixes) + + if not run_prefixes: + print(f"No run directories found with prefix: gs://{bucket_name}/{prefix}") + sys.exit(1) + + # For each run directory, find step directories + checkpoint_paths = [] + for run_prefix in run_prefixes: + step_blobs = bucket.list_blobs(prefix=run_prefix, delimiter="/") + + step_prefixes = [] + for page in step_blobs.pages: + step_prefixes.extend(page.prefixes) + + # Filter for step directories + for step_prefix in step_prefixes: + if "/step" in step_prefix: + full_path = f"gs://{bucket_name}/{step_prefix.rstrip('/')}" + checkpoint_paths.append(full_path) + + if not checkpoint_paths: + print(f"No step directories found in runs matching: gs://{bucket_name}/{prefix}") + sys.exit(1) + + # Sort paths to get the latest one + checkpoint_paths.sort(reverse=True) + + print(f"Found {len(checkpoint_paths)} checkpoint directories") + print(f"Latest checkpoint: {checkpoint_paths[0]}") + + return checkpoint_paths[0] + + except Exception as e: + print(f"Error listing GCS directories: {e}") + print(f"Make sure you have access to gs://{bucket_name}/{prefix}* directories") + sys.exit(1) + + +def check_weka_path_exists(gs_path): + """Check if the corresponding weka path already exists""" + # Convert gs:// path to weka:// path + if not gs_path.startswith("gs://"): + print(f"Error: Expected gs:// path, got: {gs_path}") + return False + + weka_path = gs_path.replace("gs://", "weka://oe-training-default/") + + # Convert weka:// path to s3:// path for s5cmd + s3_path = 
weka_path.replace("weka://oe-training-default/", "s3://oe-training-default/") + + # Add wildcard to check for any files in the directory + s3_path_wildcard = f"{s3_path}/*" + + print(f"Checking if weka path exists: {weka_path}") + print(f"Using s5cmd to check: {s3_path_wildcard}") + + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + s3_path_wildcard, + ] + + try: + # Run the command - if it succeeds, the path exists + output = run_command(cmd, errs_okay=True) + + print(f"āœ… Weka path exists - found %s files:" % len(output.split("\n"))) + # print(output) + return True + + except subprocess.CalledProcessError as e: + # If the command fails, the path doesn't exist + print(f"āŒ Weka path does not exist (s5cmd failed as expected)") + return False + except Exception as e: + print("ERR CODE", e) + raise e + + +def run_olmo_cookbook(gs_path): + """Run the olmo-cookbook command with the GCS path""" + weka_path = gs_path.replace("gs://", "weka://oe-training-default/") + + cmd = ["python", "-m", "cookbook.remote", gs_path, weka_path] + + print(f"Running: {' '.join(cmd)}") + + try: + # Run the command and stream output in real-time + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True + ) + + beaker_url = None + beaker_url_pattern = re.compile(r"https://beaker\.org/ex/[A-Z0-9]+") + + for line in process.stdout: + print(line, end="") + + # Look for the beaker URL in the output + match = beaker_url_pattern.search(line) + if match: + beaker_url = match.group(0) + + process.wait() + + if process.returncode != 0: + print(f"Error: olmo-cookbook command failed with return code {process.returncode}") + sys.exit(1) + + # Print the extracted Beaker URL + if beaker_url: + print(f"\n" + "=" * 60) + print(f"šŸ”— Beaker Experiment URL: {beaker_url}") + print(f"=" * 60) + return beaker_url + else: + print("\nWarning: Could not extract Beaker 
experiment URL from output") + + except Exception as e: + print(f"Error running olmo-cookbook: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser(description="Process YAML file and run olmo-cookbook with latest checkpoint") + parser.add_argument("yaml_file", help="Path to the YAML file") + parser.add_argument("--beaker-name", required=False, default=None) + args = parser.parse_args() + + # Validate input file exists + if not Path(args.yaml_file).exists(): + print(f"Error: YAML file {args.yaml_file} does not exist") + sys.exit(1) + + print(f"Processing YAML file: {args.yaml_file}") + + # Step 1: Get name from YAML + yaml_name = get_yaml_name(args.yaml_file) + print(f"YAML name: {yaml_name}") + + # Step 2: Get beaker name + if args.beaker_name == None: + beaker_name = get_beaker_name() + else: + beaker_name = args.beaker_name + print(f"Beaker name: {beaker_name}") + + # Step 3: Find latest checkpoint + print(f"Searching for checkpoints with prefix: gs://ai2-llm/checkpoints/{beaker_name}/{yaml_name}-") + latest_checkpoint = find_latest_checkpoint(beaker_name, yaml_name) + print(f"Latest checkpoint: {latest_checkpoint}") + + # Step 4: Check if weka path already exists + if check_weka_path_exists(latest_checkpoint): + print(f"\n🚫 Checkpoint already exists in weka storage. 
Skipping cookbook command.") + print(f"The checkpoint has already been copied to weka://oe-training-default/") + return + + # Step 5: Run olmo-cookbook command + run_olmo_cookbook(latest_checkpoint) + + +if __name__ == "__main__": + main() diff --git a/scripts/olmo3_midtrain_eval.sh b/scripts/olmo3_midtrain_eval.sh new file mode 100644 index 000000000..258b1764f --- /dev/null +++ b/scripts/olmo3_midtrain_eval.sh @@ -0,0 +1,48 @@ + +# Check if first argument is provided +if [ $# -eq 0 ]; then + echo "Usage: $0 " + echo "Example: $0 /oe-training-default/ai2-llm/checkpoints/mattj/microanneal-dolminos_math_baseline-1B-ffabe337/step477-hf" + exit 1 +fi + +# Store the first argument in a variable +CHECKPOINT_PATH="$1" +DASHBOARD="$2" + +olmo-cookbook-eval evaluate \ + "$CHECKPOINT_PATH" \ + --tasks olmo3:dev:midtrain:v1 \ + --priority high \ + --cluster aus80g \ + --num-gpus 1 \ + --partition-size 8 \ + --model-backend vllm \ + --no-compute-gold-bpb \ + --model-args chat_template=basic_answer,trust_remote_code=true,max_length=8192 \ + --use-gantry \ + --gantry-args env-secret="OPENAI_API_KEY=openai_api_key" \ + --task-args chat_overrides="{\"generation_kwargs\": {\"stop_sequences\": [\"Problem:\", \"Answer:\", \"Question:\", \"\", \"<|eot_id|>\"]}}" \ + --fim-tokens l2c \ + --oe-eval-branch davidh/olmo3 \ + --beaker-image oe-eval-beaker/oe_eval_olmo3_auto \ + --vllm-use-v1-spec \ + --dashboard $DASHBOARD \ + --workspace ai2/olmo-3-microanneals + +olmo-cookbook-eval evaluate \ + "$CHECKPOINT_PATH" \ + --tasks olmo3:dev:7b:main:v2 \ + --priority high \ + --cluster aus80g \ + --partition-size 8 \ + --num-gpus 1 \ + --model-backend vllm \ + --model-args trust_remote_code=true,max_length=4096 \ + --beaker-image oe-eval-beaker/oe_eval_olmo3_auto\ + --fim-tokens l2c \ + --vllm-use-v1-spec \ + --vllm-memory-utilization 0.7 \ + --dashboard $DASHBOARD \ + --workspace ai2/olmo-3-microanneals + diff --git a/src/cookbook/cli/eval.py b/src/cookbook/cli/eval.py index 
3ab70c506..e02e37785 100644 --- a/src/cookbook/cli/eval.py +++ b/src/cookbook/cli/eval.py @@ -6,8 +6,8 @@ import click from rich.console import Console -from rich.table import Table from rich.pretty import pprint +from rich.table import Table from cookbook.cli.utils import ( get_aws_access_key_id, @@ -26,10 +26,10 @@ TRANSFORMERS_COMMIT_HASH, TRANSFORMERS_GIT_URL, ) -from cookbook.eval.named_tasks import BaseNamedTasksGroup, NamedTasksGroupRegistry from cookbook.eval.conversion import run_checkpoint_conversion from cookbook.eval.datalake import AddToDashboard, FindExperiments, RemoveFromDashboard from cookbook.eval.evaluation import evaluate_checkpoint +from cookbook.eval.named_tasks import BaseNamedTasksGroup, NamedTasksGroupRegistry from cookbook.eval.results import make_dashboard_table, print_missing_tasks logger = logging.getLogger(__name__) @@ -428,8 +428,8 @@ def evaluate_model( dashboard, model_name, tasks, - format='return_missing', - sort_by='avg', + format="return_missing", + sort_by="avg", sort_column_name=None, sort_descending=None, force=False, @@ -440,7 +440,7 @@ def evaluate_model( if model_name in missing_tasks: tasks = missing_tasks[model_name] else: - print(f'Found no missing tasks for {model_name}') + print(f"Found no missing tasks for {model_name}") return evaluate_checkpoint( @@ -545,7 +545,6 @@ def get_results( force: bool, skip_on_fail: bool, ) -> None: - # compile tasks names into regex patterns (if possible) compiled_tasks = [re.compile(task) if re.escape(task) != task else task for task in tasks] @@ -625,7 +624,7 @@ def get_results( columns_filter_tasks=columns_filter_tasks, ) - if format == 'return_missing': + if format == "return_missing": return missing_tasks # okay we got all results! 
now time to sort them depending on the user's request diff --git a/src/cookbook/data/dataset.py b/src/cookbook/data/dataset.py index 8ba291d96..ff86f5016 100644 --- a/src/cookbook/data/dataset.py +++ b/src/cookbook/data/dataset.py @@ -43,10 +43,16 @@ def build(self) -> SourceMixtureDatasetConfig: schemes = {urlparse(path).scheme for path in paths + globs} if len(schemes) > 1: raise ValueError(f"All paths for source {source.name} must have the same scheme. Found: {schemes}") + elif len(schemes) == 0: + raise ValueError(f"No paths found for source {source.name}") scheme = schemes.pop() expanded = paths + expand_globs(self.cached_fs.get(scheme, self.cached_fs["s3"]), globs) + + if len(expanded) == 0: + raise ValueError(f"No paths found for source {source.name}") + source_configs.append( SourceMixtureConfig( source_name=source.name, diff --git a/src/cookbook/eval/datalake.py b/src/cookbook/eval/datalake.py index cf4c80a0e..b9f19d0d6 100644 --- a/src/cookbook/eval/datalake.py +++ b/src/cookbook/eval/datalake.py @@ -148,7 +148,6 @@ class FindExperiments(BaseDatalakeItem): @classmethod def run(cls, dashboard: str | None = None, model_name: str | None = None, limit: int = 10_000) -> list[Self]: - # make sure at least one of dashboard or model_name is provided assert dashboard or model_name, "Either dashboard or model_name must be provided" response = requests.get( @@ -239,9 +238,8 @@ def model_path(self) -> str | None: @property def model_name(self) -> str | None: model_name = self.model_config.get("model", None) - if 'revision' in self.model_config and \ - self.model_config['revision'] is not None: - model_name = model_name + ':' + self.model_config['revision'] + if "revision" in self.model_config and self.model_config["revision"] is not None: + model_name = model_name + ":" + self.model_config["revision"] return model_name @property @@ -304,7 +302,6 @@ def run(cls, model_name: str, dashboard: str, fuzzy: bool = False) -> List[Self] fns = [] for run in runs: - # if the 
experiment is in the cache, we remove it since we changed its tags cache.delete(experiment_id=run.experiment_id) diff --git a/src/cookbook/eval/evaluation.py b/src/cookbook/eval/evaluation.py index fbdef5110..88e91862d 100644 --- a/src/cookbook/eval/evaluation.py +++ b/src/cookbook/eval/evaluation.py @@ -1,13 +1,17 @@ import json +import os import re import shlex import subprocess +import sys +from collections.abc import Mapping from copy import deepcopy from hashlib import md5 -import sys from typing import Optional from urllib.parse import urlparse +from rich.pretty import pprint + from cookbook.cli.utils import ( PythonEnv, add_aws_flags, @@ -15,13 +19,9 @@ install_oe_eval, make_eval_run_name, ) -from cookbook.constants import ( - BEAKER_KNOWN_CLUSTERS, - FIM_TOKENS, - OE_EVAL_LAUNCH_COMMAND, - WEKA_MOUNTS, -) +from cookbook.constants import BEAKER_KNOWN_CLUSTERS, FIM_TOKENS, OE_EVAL_LAUNCH_COMMAND, WEKA_MOUNTS from cookbook.eval.named_tasks import NamedTasksGroupRegistry +from cookbook.eval.results import make_dashboard_table def evaluate_checkpoint( @@ -184,29 +184,23 @@ def evaluate_checkpoint( # these are all the tasks we want to run; note that we can't run regex patterns here, # they have to be actual strings - all_tasks_set = set() - for task_group in tasks: - try: - # this is a task group! the get function will return a class that has an expanded_tasks attribute - all_tasks_set.update(NamedTasksGroupRegistry.get(task_group).expanded_tasks) - except ValueError: - # actually not a task group, just a task name. append as is. - all_tasks_set.add(task_group) - - # we finish by sorting the tasks - all_tasks = sorted(all_tasks_set) - - # @davidh: we have a few specific tasks that are not implemented in oe-eval as standalone tasks - # @soldni: to clarify: this is fine, since these tasks are computed anyway as part of the non-bpb version, - # it's just the task alias that does not exist. 
- EXCLUDE_FROM_LAUNCH = [ - r'^mmlu_.*:bpb::olmes$', - r'^lambada:bpb$' - ] - all_tasks = [ - task for task in all_tasks - if not any(re.match(pattern, task) for pattern in EXCLUDE_FROM_LAUNCH) - ] + all_tasks = sorted( + list( + set( + task + for task_group in tasks + for task in NamedTasksGroupRegistry.get(task_group).expanded_tasks + if isinstance(task, str) + ) + ) + ) + + print("Launching evals on the following tasks:") + pprint(all_tasks) + + # @davidh we have a few specific tasks that are not implemented in oe-eval as standalone tasks + EXCLUDE_FROM_LAUNCH = [r"^mmlu_.*:bpb::olmes$", r"^lambada:bpb$"] + all_tasks = [task for task in all_tasks if not any(re.match(pattern, task) for pattern in EXCLUDE_FROM_LAUNCH)] # DOING SOME PRETTY PRINTING HERE # print( @@ -323,8 +317,11 @@ def evaluate_checkpoint( if "stop_sequences" in partition_task_args["generation_kwargs"]: # Add the stop tokens if they do not exist partition_task_args["generation_kwargs"]["stop_sequences"].extend( - [stop_tok for stop_tok in infilling_dict["generation_kwargs"]["stop_sequences"] - if stop_tok not in partition_task_args["generation_kwargs"]["stop_sequences"]] + [ + stop_tok + for stop_tok in infilling_dict["generation_kwargs"]["stop_sequences"] + if stop_tok not in partition_task_args["generation_kwargs"]["stop_sequences"] + ] ) else: partition_task_args["generation_kwargs"].update(infilling_dict["generation_kwargs"]) diff --git a/src/cookbook/model/builder.py b/src/cookbook/model/builder.py index d762c1c5d..bc08e50d9 100644 --- a/src/cookbook/model/builder.py +++ b/src/cookbook/model/builder.py @@ -16,19 +16,24 @@ NumpyDatasetType, TokenizerConfig, ) +from olmo_core.nn.attention import SlidingWindowAttentionConfig from olmo_core.data.types import NumpyDatasetDType from olmo_core.distributed.parallel import DataParallelType -from olmo_core.float8 import Float8Config +from olmo_core.float8 import AOFloat8LinearConfig, Float8Config from olmo_core.io import resource_path from 
olmo_core.nn.transformer import TransformerConfig from olmo_core.optim import ( - CosWithWarmup, OptimConfig, OptimGroupOverride, - Scheduler, SkipStepAdamWConfig, ) -from olmo_core.optim.scheduler import CosWithWarmupAndLinearDecay, LinearWithWarmup +from olmo_core.optim.scheduler import ( + WSD, + CosWithWarmup, + CosWithWarmupAndLinearDecay, + LinearWithWarmup, + Scheduler, +) from olmo_core.train import Duration, TrainerConfig from olmo_core.train.callbacks import ( BeakerCallback, @@ -66,7 +71,6 @@ WrappedTransformerConfig, ) from cookbook.model.evaluators import DownstreamEvaluator, get_tasks_for_groups -from cookbook.model.schedulers import WSD logger = logging.getLogger(__name__) @@ -195,6 +199,7 @@ class TransformerConfigBuilder: activation_checkpointing: bool annealing: Optional[AnnealConfig] = None profile: bool = False + shard_degree: Optional[int] = None def __init__( self, @@ -215,6 +220,7 @@ def __init__( lm_evaluator: bool, downstream_evaluators: List[DownstreamEvaluator], # type: ignore scheduler_type: SchedulerType, + shard_degree: Optional[int] = None, activation_checkpointing: bool = False, model_overrides: Optional[List[str]] = None, load_path_fs: Optional[Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] = None, @@ -229,6 +235,7 @@ def __init__( seed: int = 42, warmup_steps: Optional[int] = None, profile: bool = False, + float8_enabled: bool = True, ): self.run_name = run_name self.sources = sources @@ -266,6 +273,9 @@ def __init__( self.checkpoint_dir = f"{self.data_dir}/checkpoints/{self.beaker_user.lower()}/{self.run_name}" self.eval_interval = eval_interval self.cluster = cluster + self.float8_enabled = float8_enabled + self.cancel_check_interval = 50 + self.shard_degree = shard_degree if any(substring in cluster for substring in ["augusta"]): self.root_dir = "gs://ai2-llm" @@ -338,8 +348,8 @@ def build_callbacks(self) -> Dict[str, Callback]: callbacks = { "checkpointer": CheckpointerCallback( save_interval=self.save_interval, - 
ephemeral_save_interval=100, - save_async=True, + ephemeral_save_interval=None, + save_async=False, ), "config_saver": ConfigSaverCallback(), "profiler": ProfilerCallback(enabled=self.profile), @@ -368,7 +378,7 @@ def build_callbacks(self) -> Dict[str, Callback]: project=self.metrics_config.project.strip(), entity=self.metrics_config.entity.strip(), group=self.group_id.strip(), - cancel_check_interval=10, + cancel_check_interval=self.cancel_check_interval, enabled=True, ) if MetricBackend.comet in self.metrics_config.backends: @@ -385,7 +395,7 @@ def build_callbacks(self) -> Dict[str, Callback]: workspace=self.metrics_config.workspace.strip(), project=self.metrics_config.project.strip(), enabled=True, - cancel_check_interval=10, + cancel_check_interval=self.cancel_check_interval, ) if self.lm_evaluator: @@ -412,7 +422,7 @@ def build_callbacks(self) -> Dict[str, Callback]: return callbacks - def build_dataset_config(self, loader_processes: int = 16) -> NumpyDatasetConfig: + def build_dataset_config(self, loader_processes: int = 8) -> NumpyDatasetConfig: is_fractional = any(source.ratio is not None and source.ratio != 1 for source in self.sources) mixture_config = None @@ -475,14 +485,14 @@ def get_optimizer_config(self) -> OptimConfig: lr=lr, weight_decay=0.033, betas=(0.9, 0.95), + foreach=True, group_overrides=[OptimGroupOverride(params=["embeddings.weight"], opts=dict(weight_decay=0.0))], ) def get_ac_config(self): - # NOTE: This is pretty broad, we can make this more fine-grained if we find it useful return TransformerActivationCheckpointingConfig( mode=TransformerActivationCheckpointingMode.selected_modules, - modules=["blocks.*.feed_forward"], + modules=[f"blocks.{i}.feed_forward" for i in range(0, 64, 4)], ) def load_state_and_config_from_path(self) -> Tuple[Path, Path]: @@ -511,6 +521,16 @@ def load_state_and_config_from_path(self) -> Tuple[Path, Path]: resource_path(folder=self.load_path, fname="config.json"), ) + def get_fp8_config(self) -> Float8Config: + 
return Float8Config( + enabled=self.float8_enabled, + ao=AOFloat8LinearConfig( + enable_fsdp_float8_all_gather=True, + force_recompute_fp8_weight_in_bwd=True, + round_scales_to_power_of_2=True, + ), + ) + def get_state_from_checkpoint(self) -> SchedulerState: state_path, config_path = self.load_state_and_config_from_path() train_state = torch.load(state_path, weights_only=False) @@ -539,30 +559,47 @@ def get_state_from_checkpoint(self) -> SchedulerState: try: # Try olmo_core v2 config format first - base_lr: int = config["optim"]["lr"] + base_lr: int = config["train_module"]["optim"]["lr"] scheduler_config = config["train_module"]["scheduler"] - except KeyError as e: + except KeyError: # Now try olmo_core v1 config format try: base_lr: int = config["optim"]["lr"] scheduler_config = config["trainer"]["callbacks"]["lr_scheduler"]["scheduler"] - except KeyError as e: + except Exception as e: logger.error( - "Could not find base_lr or scheduler config in train state. Please ensure the checkpoint is valid. Unable to load scheduler state." + "Could not find base_lr or scheduler config in train state. Please ensure the checkpoint is valid. Unable to load scheduler state.", + e, ) raise e scheduler_class = scheduler_config.pop("_CLASS_").split(".")[-1] try: - assert scheduler_class == CosWithWarmup.__name__ + assert scheduler_class == CosWithWarmup.__name__ or scheduler_class == WSD.__name__ except AssertionError as e: logger.error( - f"Expected scheduler class {CosWithWarmup.__name__}, but got {scheduler_class}: Anneals from a base LR can only be inferred from CosWithWarmup scheduler." + f"Expected scheduler class {CosWithWarmup.__name__} or {WSD.__name__}, but got {scheduler_class}: Anneals from a base LR cannot be inferred from this scheduler type. Exiting!" 
) raise e - scheduler = CosWithWarmup(**scheduler_config) + try: + if scheduler_class == WSD.__name__: + if not scheduler_config.get("decay_fraction", None): + scheduler_config["decay_fraction"] = None + scheduler = WSD(**scheduler_config) + elif scheduler_class == CosWithWarmup.__name__: + scheduler = CosWithWarmup(**scheduler_config) + else: + raise ValueError(f"Unsupported scheduler class: {scheduler_class}") + except Exception as e: + logger.error( + "Could not instantiate scheduler from config. Please ensure the checkpoint is valid. Unable to load scheduler state.", + e, + ) + logger.info(scheduler_config) + raise e + starting_lr = float(scheduler.get_lr(base_lr, last_pretrain_step, max_pretrain_steps)) return SchedulerState( @@ -596,10 +633,14 @@ def build(self) -> ModelTrainConfig: optim=self.get_optimizer_config(), compile_model=True, dp_config=train_module.TransformerDataParallelConfig( - name=DataParallelType.hsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32 + name=DataParallelType.hsdp, + param_dtype=DType.bfloat16, + reduce_dtype=DType.float32, + wrapping_strategy=train_module.TransformerDataParallelWrappingStrategy.blocks, + shard_degree=self.shard_degree, ), ac_config=self.get_ac_config() if self.activation_checkpointing else None, - float8_config=Float8Config(enabled=False), + float8_config=self.get_fp8_config(), z_loss_multiplier=1e-5, max_grad_norm=1.0, scheduler=self.get_scheduler_config(), @@ -613,7 +654,7 @@ def build(self) -> ModelTrainConfig: work_dir=self.dataset_cache, save_overwrite=True, metrics_collect_interval=10, - cancel_check_interval=5, + cancel_check_interval=self.cancel_check_interval, max_duration=Duration.tokens(self.max_tokens), ) @@ -627,6 +668,16 @@ def build(self) -> ModelTrainConfig: self.transformer_config = self.transformer_config.merge(dotlist=self.model_overrides) + # TODO(undfined): The hax once swafix is not an issue anymore + if self.model_identifier == "olmo2_7B_swafix": + 
self.transformer_config.block.attention.sliding_window = SlidingWindowAttentionConfig( + force_full_attention_on_first_layer=False, + force_full_attention_on_last_layer=True, + pattern=[4096, 4096, 4096, -1], + ) + self.transformer_config.block.attention.use_flash = True + self.transformer_config.block.attention.use_head_qk_norm = True + return ModelTrainConfig( init_seed=self.seed, model=self.transformer_config, diff --git a/src/cookbook/model/config.py b/src/cookbook/model/config.py index 373c04c75..457ab2841 100644 --- a/src/cookbook/model/config.py +++ b/src/cookbook/model/config.py @@ -1,6 +1,5 @@ from dataclasses import dataclass from enum import Enum -from typing import Any, Optional import olmo_core.train.train_module as train_module from olmo_core.config import Config @@ -187,6 +186,18 @@ def olmo_30m(cls, tokenizer: TokenizerConfig) -> TransformerConfig: block_name=DefaultTransformerProperties.block_type, ) + @classmethod + def olmo2_7B_swafix(cls, tokenizer: TokenizerConfig) -> TransformerConfig: + """ + OLMo2 7B with SWA fix changes + """ + return getattr(TransformerConfig, "olmo2_7B")( + vocab_size=tokenizer.padded_vocab_size(), + n_kv_heads=8, + hidden_size_multiplier=1.2, + hidden_size_multiple_of=1024, + ) + @classmethod def from_model_identifier( cls, diff --git a/src/cookbook/model/schedulers.py b/src/cookbook/model/schedulers.py deleted file mode 100644 index 09d03bcd2..000000000 --- a/src/cookbook/model/schedulers.py +++ /dev/null @@ -1,71 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, Union - -import torch -from olmo_core.exceptions import OLMoConfigurationError -from olmo_core.optim import Scheduler - - -@dataclass -# NOTE: Temporary port from https://github.com/allenai/OLMo-core/blob/dirkg/DenseExperiments/src/olmo_core/optim/scheduler.py#L67 for debugging -class WSD(Scheduler): - """ - Warmup-stable-decay scheduler - """ - - warmup_steps: Optional[int] = 2000 - warmup_fraction: Optional[float] = None - decay_steps: 
Optional[int] = None - decay_fraction: Optional[float] = 0.1 - warmup_min_lr: float = 0.0 - decay_min_lr: float = 0.0 - - def __post_init__(self): - if (self.warmup_fraction is None) == (self.warmup_steps is None): - raise OLMoConfigurationError("Either warmup_fraction or warmup_steps must be specified.") - if self.warmup_fraction is not None and (self.warmup_fraction < 0 or self.warmup_fraction > 1): - raise OLMoConfigurationError("warmup_fraction must be between 0 and 1.") - - if (self.decay_fraction is None) == (self.decay_steps is None): - raise OLMoConfigurationError("Either decay_fraction or decay_steps must be specified.") - if self.decay_fraction is not None and (self.decay_fraction < 0 or self.decay_fraction > 1): - raise OLMoConfigurationError("decay_fraction must be between 0 and 1.") - - def get_lr( - self, initial_lr: Union[float, torch.Tensor], step: int, max_steps: int - ) -> Union[float, torch.Tensor]: - if self.warmup_steps is None: - warmup_steps = round(max_steps * self.warmup_fraction) if self.warmup_fraction is not None else 0 - else: - warmup_steps = self.warmup_steps - - if step <= warmup_steps: - return _linear_warmup(initial_lr, step, warmup_steps, self.warmup_min_lr) - - if self.decay_steps is None: - decay_steps = round(max_steps * self.decay_fraction) if self.decay_fraction is not None else 0 - else: - decay_steps = self.decay_steps - - if step >= max_steps - decay_steps: - return _linear_decay(initial_lr, max_steps - step, decay_steps, self.decay_min_lr) - - del step, max_steps - return initial_lr - - -def _linear_warmup( - initial_lr: Union[float, torch.Tensor], step: int, warmup_steps: int, warmup_min_lr: float = 0.0 -) -> Union[float, torch.Tensor]: - if isinstance(initial_lr, float): # not worth the potential host-device sync if it's a tensor - assert 0 <= warmup_min_lr < initial_lr - return warmup_min_lr + (initial_lr - warmup_min_lr) * min(step, warmup_steps) / warmup_steps - - -def _linear_decay( - initial_lr: Union[float, 
torch.Tensor], step_from_end: int, decay_steps: int, decay_min_lr: float = 0.0 -) -> Union[float, torch.Tensor]: - if isinstance(initial_lr, float): # not worth the potential host-device sync if it's a tensor - assert 0 <= decay_min_lr < initial_lr - - return decay_min_lr + (initial_lr - decay_min_lr) * min(step_from_end, decay_steps) / decay_steps diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo25_7b-with-reasoning-10B-anneal-2T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo25_7b-with-reasoning-10B-anneal-2T.yaml new file mode 100644 index 000000000..7948ae884 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo25_7b-with-reasoning-10B-anneal-2T.yaml @@ -0,0 +1,68 @@ +name: "anneal-round1-10B-olmo25_7b_with-reasoning-anneal-2T" +description: "OLMo2.5 7b anneal to 10B Tokens on Round 1 midtraining mix WITH reasoning data from 2T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo25_7b" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00020712352850360292 +load_path: gs://ai2-llm/checkpoints/OLMo25/step476838 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: 
dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo3_7b-with-reasoning-10B-anneal-4T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo3_7b-with-reasoning-10B-anneal-4T.yaml new file mode 100644 index 000000000..4b44e2d6b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo3_7b-with-reasoning-10B-anneal-4T.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-10B-olmo3_7b_with-reasoning-anneal-4T" +description: "OLMo3 7b anneal to 10B Tokens on Round 1 midtraining mix WITH reasoning data from 4T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 
10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo25_7b-10B-anneal-2T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo25_7b-10B-anneal-2T.yaml new file mode 100644 index 000000000..3eedcacc3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo25_7b-10B-anneal-2T.yaml @@ -0,0 +1,86 @@ +name: "anneal-round3-10B-olmo25_7b-anneal-2T" +description: "OLMo2.5 7b anneal to 10B Tokens on Round 3 midtraining mix from 2T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo25_7b" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00020712352850360292 +load_path: gs://ai2-llm/checkpoints/OLMo25/step476838 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B 
+ - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo3_7b-10B-anneal-4T.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo3_7b-10B-anneal-4T.yaml new file mode 100644 index 000000000..35fe83abd --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo3_7b-10B-anneal-4T.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-10B-olmo3_7b-anneal-4T" +description: "OLMo3 7b anneal to 10B Tokens on Round 3 midtraining mix from 4T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 
240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo25_7b-10B-anneal-decon-2T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo25_7b-10B-anneal-decon-2T.yaml new file mode 100644 index 000000000..a0b048432 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo25_7b-10B-anneal-decon-2T.yaml @@ -0,0 +1,355 @@ +name: "anneal-round5-10B-olmo25_7b-anneal-2T" +description: "OLMo2.5 7b anneal to 
10B Tokens on Round 5 midtraining mix from 2T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo25_7b" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00020712352850360292 +load_path: gs://ai2-llm/checkpoints/OLMo25/step476838 +load_state: false +dataset: + sources: + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: code_fim + target_ratio: 0.1 + paths: + # 21,390,279,634 + - s3://ai2-llm/preprocessed/stack-edu/sample-fim-weighted-pl-edu-score-decon/**/**/*.npy + - name: swallowcode + target_ratio: 0.1 + paths: + # 18,833,636,683 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data-decon/allenai/dolma2-tokenizer/*.npy + - name: megamatt + # 20% less the ratio for dolminos2math + target_ratio: 0.01698 + paths: + # 3,883,674,937 + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10,687,987,907 + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,848,999 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,380 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + # 9,860,465,314 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 21,390,279,634 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + # 4,215,210,848 + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: .05 + paths: + # 486,558,362,887 + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.0 + # paths: + # # 1,627,593,038 + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.011 + paths: + # 1,639,399,859 + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: .05 + paths: + # 17,055,708,123 + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2,483,453,165 + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4,774,150,082 + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254,415,258 + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .0125 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .0125 + paths: + # 5,601,836,260 + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + # ORIGINALS + # 1,198,073,462 --> actually code meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 366,757,554 --> has special tokens + # - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 1,198,074,765 --> different version of code meta reasoning + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #### + # FIXES + # 1,049,524,455 --> actual math meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-fixed-decon/allenai/dolma2-tokenizer/*.npy + # 364,483,656 --> verifiable/gpt-41 without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/cleaned-documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 73,752,560 --> verifiable/gpt-o4-mini without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/o4-mini-high-decon/allenai/dolma2-tokenizer/*.npy + #### + # NEW FORMAT + # 1,057,302,754 --> actual math 
meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + # 1,199,242,351 --> actual code meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + + # PDF 0.05 + - name: pdf-quality-art_design + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.000775 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.00258 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00011 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0007 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.00024 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + target_ratio: 0.005935 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.00027 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.00269 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + target_ratio: 0.00045 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.00092 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.00026 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + target_ratio: 0.02976 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + target_ratio: 0.00059 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.00435 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.00019 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + + # Web 0.225 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0002556935 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.0028240385 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.0065884955 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.0096539490 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.0077935890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.0216076590 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 0.0000286700 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.0091422550 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.0031632340 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.0154939065 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.0224049655 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.0061185640 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.0020584890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.0035336085 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + target_ratio: 0.0153408140 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: 
web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.0048655335 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.0044465105 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.0473059895 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.0010352740 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.0106495710 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.0250799635 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.0029238175 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.0021929705 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0004924385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo3_7b-10B-anneal-decon-4T.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo3_7b-10B-anneal-decon-4T.yaml new file mode 100644 index 000000000..d4fbc9f75 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo3_7b-10B-anneal-decon-4T.yaml @@ -0,0 +1,354 @@ +name: "anneal-round5-10B-olmo3_7b-anneal-4T" +description: "OLMo3 7b anneal to 10B Tokens on Round 5 midtraining mix from 4T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: code_fim + target_ratio: 0.1 + paths: + # 21,390,279,634 + - s3://ai2-llm/preprocessed/stack-edu/sample-fim-weighted-pl-edu-score-decon/**/**/*.npy + - name: swallowcode + target_ratio: 0.1 + paths: + # 18,833,636,683 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data-decon/allenai/dolma2-tokenizer/*.npy + - name: megamatt + # 20% less the ratio for dolminos2math + target_ratio: 0.01698 + paths: + # 3,883,674,937 + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10,687,987,907 + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,848,999 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,380 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + # 9,860,465,314 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 21,390,279,634 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + # 4,215,210,848 + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: .05 + paths: + # 486,558,362,887 + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.0 + # paths: + # # 1,627,593,038 + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.011 + paths: + # 1,639,399,859 + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: .05 + paths: + # 17,055,708,123 + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2,483,453,165 + - 
s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4,774,150,082 + - s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254,415,258 + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .0125 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .0125 + paths: + # 5,601,836,260 + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + # ORIGINALS + # 1,198,073,462 --> actually code meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 366,757,554 --> has special tokens + # - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 1,198,074,765 --> different version of code meta reasoning + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #### + # FIXES + # 1,049,524,455 --> actual math meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-fixed-decon/allenai/dolma2-tokenizer/*.npy + # 364,483,656 --> verifiable/gpt-41 without special tokens + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/cleaned-documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 73,752,560 --> verifiable/gpt-o4-mini without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/o4-mini-high-decon/allenai/dolma2-tokenizer/*.npy + #### + # NEW FORMAT + # 1,057,302,754 --> actual math meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + # 1,199,242,351 --> actual code meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + + # PDF 0.05 + - name: pdf-quality-art_design + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.000775 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.00258 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0007 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.00024 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + 
target_ratio: 0.005935 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.00027 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.00269 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + 
target_ratio: 0.00045 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.00092 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.00026 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + target_ratio: 0.02976 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + 
target_ratio: 0.00059 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.00435 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.00019 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + + # 
Web 0.225 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0002556935 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.0028240385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.0065884955 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.0096539490 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.0077935890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.0216076590 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 0.0000286700 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.0091422550 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.0031632340 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.0154939065 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.0224049655 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.0061185640 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.0020584890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.0035336085 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + 
target_ratio: 0.0153408140 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.0048655335 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.0044465105 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.0473059895 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.0010352740 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - 
name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.0106495710 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.0250799635 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.0029238175 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.0021929705 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0004924385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/fae-olmo3_7b-llamma-nemotron-reasonig-5b-sub8k.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/fae-olmo3_7b-llamma-nemotron-reasonig-5b-sub8k.yaml new file mode 100644 index 000000000..53af198e0 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/fae-olmo3_7b-llamma-nemotron-reasonig-5b-sub8k.yaml @@ -0,0 +1,41 @@ +name: "math-code-with-llamma-nemotron-reasoning-olmo3-microanneal-5b-sub8k" +description: "OLMo3 7b 5B web, math and code with llama nemotron microanneal -- under 8k length" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.4 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.3 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: bigreasoningtraces + target_ratio: 0.1 + paths: + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-sub8k-flat/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git
a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-anneal-round5-gen-mc-ratios-10B-olmo3_7b-12T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-anneal-round5-gen-mc-ratios-10B-olmo3_7b-12T-microanneal.yaml new file mode 100644 index 000000000..b2d99f7fb --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-anneal-round5-gen-mc-ratios-10B-olmo3_7b-12T-microanneal.yaml @@ -0,0 +1,260 @@ +name: "mixing-gen-mc-round5-ratios-10B-olmo3-microanneal" +description: "OLMo3 7b 10B mixing gen/MC validation with round 5 ratios" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: sponge + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.09523809523809523 + - name: reddit + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.11238095238095237 + - name: rcqa + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.057142857142857134 + - name: nemotron-synth-qa + paths: + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.09523809523809523 + - name: 
instruction-new-format + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.02095238095238095 + - name: flan + paths: + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.09523809523809523 + - name: pdf-quality-art_design + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 6.666666666666666e-05 + - name: pdf-quality-crime_law + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.001476190476190476 + - name: pdf-quality-education_jobs + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.004914285714285714 + 
- name: pdf-quality-entertainment + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0002095238095238095 + - name: pdf-quality-finance_business + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0013333333333333333 + - name: pdf-quality-hardware + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00045714285714285713 + - name: pdf-quality-health + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.011304761904761904 + - name: pdf-quality-history + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0005142857142857143 + - name: pdf-quality-home_hobbies + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 6.666666666666666e-05 + - name: pdf-quality-industrial + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0051238095238095235 + - name: pdf-quality-literature + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0008571428571428571 + - name: pdf-quality-politics + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0017523809523809523 + - name: pdf-quality-religion + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0004952380952380951 + - name: pdf-quality-science_tech + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + target_ratio: 
0.056685714285714284 + - name: pdf-quality-software + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0011238095238095239 + - name: pdf-quality-software_dev + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.008285714285714285 + - name: pdf-quality-sports_fitness + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0003619047619047619 + - name: pdf-quality-transportation + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0002095238095238095 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00048703523809523805 + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005379120952380952 + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.012549515238095239 + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.018388474285714284 + - name: 
web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.014844931428571427 + - name: web-all_dressed-snazzy2-v18-v20--entertainment + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.041157445714285715 + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + target_ratio: 5.460952380952381e-05 + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.017413819047619048 + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.006025207619047619 + - name: web-all_dressed-snazzy2-v18-v20--games + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.029512202857142854 + - name: web-all_dressed-snazzy2-v18-v20--health + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04267612476190476 + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.011654407619047619 + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.003920931428571428 + - name: web-all_dressed-snazzy2-v18-v20--industrial + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.006730682857142857 + - name: web-all_dressed-snazzy2-v18-v20--literature + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.029220598095238092 + - name: web-all_dressed-snazzy2-v18-v20--politics + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.009267682857142856 + - name: web-all_dressed-snazzy2-v18-v20--religion + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.008469543809523809 + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + 
target_ratio: 0.09010664666666666 + - name: web-all_dressed-snazzy2-v18-v20--social_life + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0019719504761904765 + - name: web-all_dressed-snazzy2-v18-v20--software_development + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.020284897142857142 + - name: web-all_dressed-snazzy2-v18-v20--software + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04777135904761905 + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005569176190476191 + - name: web-all_dressed-snazzy2-v18-v20--transportation + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.004177086666666666 + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0009379780952380953 \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-nat-dist-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-nat-dist-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml new file mode 100644 index 000000000..1edca9daf --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-nat-dist-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml @@ -0,0 +1,276 @@ +name: "mixing-gen-mc-nat-dist-ratios-10B-olmo3-microanneal" +description: "OLMo3 7b 10B mixing gen/MC validation with natural distribution ratios" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: sponge + # 5,220,997,516 + target_ratio: 0.002492064670205875 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/*/allenai/dolma2-tokenizer/*.npy + - name: reddit-high + # 
9,860,465,314 + target_ratio: 0.004706556010705806 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit-low + # 11,789,114,475 + target_ratio: 0.005627130751571149 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + # 4,215,210,848 + target_ratio: 0.002011986789799757 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + # 486,558,362,887 + target_ratio: 0.2322420001978607 + paths: + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + # 1,639,399,859 + target_ratio: 0.0007825114753328669 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + # 17,055,708,123 + target_ratio: 0.008140959176558946 + paths: + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + + #PDF-Web 0.7439967909279649 + # 95,853,782,702 + # 144,337,465,243 + # 737,026,615,245 + # 581,491,845,987 + - name: pdf-quality-art_design + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 7.891913084101734e-05 + - name: pdf-quality-crime_law + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.001747495040051098 + - name: pdf-quality-education_jobs + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005817467359137849 + - name: pdf-quality-entertainment + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00024803155407176884 + - name: pdf-quality-finance_business + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0015783826168203467 + - name: pdf-quality-hardware + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0005411597543384047 + - name: pdf-quality-health + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.013382429758326798 + - name: pdf-quality-history + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0006088047236307052 + - name: pdf-quality-home_hobbies + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 7.891913084101734e-05 + - name: pdf-quality-industrial + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.006065498913209619 + - name: pdf-quality-literature + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0010146745393845088 + - name: pdf-quality-politics + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 
0.0020744457249638848 + - name: pdf-quality-religion + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0005862564005332716 + - name: pdf-quality-science_tech + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.06710380953796218 + - name: pdf-quality-software + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0013303510627485783 + - name: pdf-quality-software_dev + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.009808520547383584 + - name: pdf-quality-sports_fitness + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00042841813885123695 + - name: pdf-quality-transportation + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00024803155407176884 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0007173682024900249 + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.007923061878802653 + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.018484541742158554 + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.02708491236992874 + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.021865526233072138 + - name: web-all_dressed-snazzy2-v18-v20--entertainment + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.06062198490320407 + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + target_ratio: 8.043593742269167e-05 + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.025649314652329617 + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00887470150252287 + - name: web-all_dressed-snazzy2-v18-v20--games + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04346937194513554 + - name: web-all_dressed-snazzy2-v18-v20--health + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 
0.06285889092834201 + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.01716611199932801 + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005775252612113678 + - name: web-all_dressed-snazzy2-v18-v20--industrial + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.009913816260282225 + - name: web-all_dressed-snazzy2-v18-v20--literature + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04303985890886475 + - name: web-all_dressed-snazzy2-v18-v20--politics + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.013650636488803977 + - name: web-all_dressed-snazzy2-v18-v20--religion + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.012475034542286063 + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.13272067007814822 + - name: web-all_dressed-snazzy2-v18-v20--social_life + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0029045425420069658 + - name: web-all_dressed-snazzy2-v18-v20--software_development + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.029878208110725914 + - name: web-all_dressed-snazzy2-v18-v20--software + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0703638079752142 + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.008202999702314992 + - name: web-all_dressed-snazzy2-v18-v20--transportation + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0061525510257345265 + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0013815748995648469 \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-proposed-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-proposed-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml new file mode 100644 index 000000000..dc391a0bd --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-proposed-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml @@ -0,0 +1,263 @@ +name: "mixing-gen-mc-proposed-ratios-10B-olmo3-microanneal" +description: "OLMo3 7b 10B mixing gen/MC validation with proposed ratios" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: pdf-quality-art_design + target_ratio: 7.016231376725744e-05 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.0015535940905607002 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.005171964843414975 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00022051012898280912 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0014032462753451485 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.0004811130086897653 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + target_ratio: 0.011897523777390655 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.0005412521347759859 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 7.016231376725744e-05 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.005392474972397786 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + target_ratio: 0.00090208689129331 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.001844266533310767 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.0005212057594139124 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + 
target_ratio: 0.0596580130775309 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + target_ratio: 0.0011827361463623398 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.008720173282501995 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.0003808811318793975 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00022051012898280912 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0006377694783683423 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.007043923920776696 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.01643350862758409 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.024079587544949644 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.01943934120791984 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.05389540762354546 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 7.151081644555054e-05 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.022803282846299847 + 
paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.007889970210963536 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.03864603780069838 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.05588410796440155 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.015261371018986869 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - 
name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.005134434218143875 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.008813785449387414 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + target_ratio: 0.0382641832605278 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.012135970457906422 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.011090812563261708 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.1179940680819547 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.00258225633013083 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.02656294094889634 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.06255628414055135 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.007292800019629874 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.005469867837321492 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0012282762184939744 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.062108549485272356 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: reddit-high + target_ratio: 0.11904721336433728 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit-low + target_ratio: 0.0009788837065593818 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.0012504569520369563 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: 0.06109484203464316 + paths: + - 
s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.023028775198057915 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: 0.07104790386157962 + paths: + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/testrun-webv18-redditv1-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/testrun-webv18-redditv1-10B-microanneal.yaml new file mode 100644 index 000000000..646d97ea6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/testrun-webv18-redditv1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv19-redditv1-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit v1 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy \ No 
newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-finemath-stackedufim-omrfullthoughts-5B-olmo3_7B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-finemath-stackedufim-omrfullthoughts-5B-olmo3_7B-microanneal.yaml new file mode 100644 index 000000000..b68c3589a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-finemath-stackedufim-omrfullthoughts-5B-olmo3_7B-microanneal.yaml @@ -0,0 +1,41 @@ +name: "math-code-with-omrfullthoughts-olmo3-microanneal-5b" +description: "OLMo3 7b 5B web, math and code with OMR fulthoughts microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.4 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + target_ratio: 0.3 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: omrfullthoughts + target_ratio: 0.1 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-mathmeta-codemeta-verifiable-5B-olmo3_7B-microanneal.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-mathmeta-codemeta-verifiable-5B-olmo3_7B-microanneal.yaml new file mode 100644 index 000000000..8467bf152 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-mathmeta-codemeta-verifiable-5B-olmo3_7B-microanneal.yaml @@ -0,0 +1,39 @@ +name: "webv18-mathmeta-codemeta-verifiable-5B-olmo3-4T-microanneal" +description: "OLMo3 7b 5B web v18 + math meta + code meta + verifiable microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: 0.5 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-only-5B-olmo3_7B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-only-5B-olmo3_7B-microanneal.yaml new file mode 100644 index 000000000..43f379847 --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-only-5B-olmo3_7B-microanneal.yaml @@ -0,0 +1,29 @@ +name: "webv18-only-5B-olmo3-4T-microanneal" +description: "OLMo3 7b 5B web v18 only microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 1.0 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-lowthresh-NON_MC-rewrites-5B-olmo3_7B-4T_microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-lowthresh-NON_MC-rewrites-5B-olmo3_7B-4T_microanneal.yaml new file mode 100644 index 000000000..9d3b4ef1f --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-lowthresh-NON_MC-rewrites-5B-olmo3_7B-4T_microanneal.yaml @@ -0,0 +1,34 @@ +name: "webv18-reddit-lowthresh-nonMC-5B-olmo3-4T-microanneal" +description: "OLMo3 7b 5B web v18 + reddit lowthresh nonMC rewrites (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 
+load_state: false +dataset: + sources: + - name: web + target_ratio: 0.54 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: nonMC_reddit_lowthresh + target_ratio: 0.46 + paths: + #2.32B + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_rewrites_nonmc-tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-psgqav1-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-psgqav1-10B-microanneal.yaml new file mode 100644 index 000000000..cd87a5921 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-psgqav1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-psgqav1-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + passageQA v1 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.58 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: psgqa + target_ratio: 0.42 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-highthresh-diverseqa-10B-microanneal.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-highthresh-diverseqa-10B-microanneal.yaml new file mode 100644 index 000000000..ca0210813 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-highthresh-diverseqa-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-highthresh-diverseqa-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit highthresh with diversified QA formats microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_diverseqa/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-addcontextv1-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-addcontextv1-10B-microanneal.yaml new file mode 100644 index 000000000..bc96ba2ea --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-addcontextv1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-lowthresh-addcontextv1-10B-olmo3-4T-microanneal" +description: "OLMo3 7b 10B web v18 + reddit lowthresh add-context v1 (no choices) microanneal (4T)" +budget: "ai2/oe-base" +workspace: 
"ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_addcontext-v1_tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-mcplusfull-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-mcplusfull-10B-microanneal.yaml new file mode 100644 index 000000000..f89425f9d --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-mcplusfull-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-lowthresh-mcplusfull-10B-olmo3-4T-microanneal" +description: "OLMo3 7b 10B web v18 + reddit lowthresh MC plus full answer format microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + 
target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_rewrites_mcplusfull_tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh663-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh663-10B-microanneal.yaml new file mode 100644 index 000000000..2ab84f5ea --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh663-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-lowthresh663-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit lowthresh 663 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_rewrites_663_tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1-10B-microanneal.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1-10B-microanneal.yaml new file mode 100644 index 000000000..646d97ea6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv19-redditv1-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit v1 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1and2-10B-olmo3_7B-4T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1and2-10B-olmo3_7B-4T-microanneal.yaml new file mode 100644 index 000000000..ce64c009d --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1and2-10B-olmo3_7B-4T-microanneal.yaml @@ -0,0 +1,34 @@ +name: "webv18-redditv1and2-10B-olmo3-4T-microanneal" +description: "OLMo3 7b 10B web v18 + reddit v1 (highthresh) and v2 (lowthresh) microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 
2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-no-reasoning-anneal.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-no-reasoning-anneal.yaml new file mode 100644 index 000000000..c84eff439 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-no-reasoning-anneal.yaml @@ -0,0 +1,57 @@ +name: "anneal-round1-100B-olmo3_7b_no-reasoning-anneal" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITHOUT reasoning data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + # 8_923_780_609 tokens + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-12T.yaml new file mode 100644 index 000000000..4210cc2a2 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-12T.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data from 12T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 
+global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T-largebatchLR.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T-largebatchLR.yaml new file mode 100644 index 000000000..888f5c822 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T-largebatchLR.yaml @@ -0,0 +1,68 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal-7T-largebatchLR" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data from 7T ckpt with larger batch size and LR" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 16777216 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00024890158 +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step467000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - 
s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T.yaml new file mode 100644 index 000000000..6c7c2ea17 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal-7T" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data from 7T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" 
+priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step467000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal.yaml new file mode 100644 index 000000000..ac728febe --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-12T.yaml new file mode 100644 index 000000000..f1d3dedb6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-12T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-100B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt -- round 2 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - 
s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-7T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-7T.yaml new file mode 100644 index 000000000..f2d882b1a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-7T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-100B-olmo3_7b_with-reasoning-anneal-7T" +description: "OLMo3 7b anneal to 100B Tokens from 7T ckpt -- round 2 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step467000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + 
# 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-8T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-8T.yaml new file mode 100644 index 000000000..96a779ddc --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-8T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-100B-olmo3_7b_with-reasoning-anneal-8T" +description: "OLMo3 7b anneal to 100B Tokens from 8T ckpt -- round 2 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction 
+ target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-webround1-olmo3_7b-anneal-8T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-webround1-olmo3_7b-anneal-8T.yaml new file mode 100644 index 000000000..6c7ea22da --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-webround1-olmo3_7b-anneal-8T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-webround1-100B-olmo3_7b_with-reasoning-anneal-8T" +description: "OLMo3 7b anneal to 100B Tokens from 8T ckpt -- round 2 data mix with round 1 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: 
true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-200B.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-200B.yaml new file mode 100644 index 000000000..1f9f964c7 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-200B.yaml @@ -0,0 +1,98 @@ +name: "anneal-round3-webround2-200B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 200B Tokens from 12T ckpt -- round 3 data mix with round2 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 200_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.425 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0020/*.npy + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0020/*.npy 
+ - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0020/*.npy + - name: sponge + target_ratio: 0.025 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-all/allenai/dolma2-tokenizer/**/**/*.npy + - name: megamath + # 10% plus the .0366 under 10% for dolminos2math + target_ratio: 0.1366 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.0634 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.092 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.008 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: reasoning + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-noinstruct-noreasoning.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-noinstruct-noreasoning.yaml new file mode 100644 index 000000000..9f176b114 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-noinstruct-noreasoning.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-webround2-100B-olmo3_7b_no-reasoning-no-instruct-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt -- round 3 
data mix with round2 web data, no instruction or reasoning data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.461 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0069 over 10% for dolminos2math + target_ratio: 0.0931 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1069 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + # - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # # 898,733,958 + # - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # # 240,590,634 + # - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.011 + # paths: + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + # - name: r1_reasoning + # target_ratio: 0.01875 + # paths: + # - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + # - name: qwq_reasoning + # target_ratio: 0.01875 + # paths: + # - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + # - name: gemini_reasoning + # target_ratio: 0.0025 + # paths: + # - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + # - name: faeze_reasoning + # target_ratio: .01 + # paths: + # #3.166B + # #1,144,531,442 + # - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + # #366,757,554 + # - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + # #1,198,181,281 + # - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T.yaml new file mode 100644 index 000000000..0c5f40496 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-webround2-100B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt -- round 3 data mix with round2 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: 
"olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning 
+ target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-8T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-8T.yaml new file mode 100644 index 000000000..366110b96 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-8T.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-webround2-100B-olmo3_7b_with-reasoning-anneal-8T" +description: "OLMo3 7b anneal to 100B Tokens from 8T ckpt -- round 3 data mix with round2 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round4-olmo3_7b-anneal-decon-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round4-olmo3_7b-anneal-decon-12T.yaml new file mode 100644 index 000000000..957734dd3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round4-olmo3_7b-anneal-decon-12T.yaml @@ -0,0 +1,105 @@ +name: "anneal-round4-100B-olmo3_7b-anneal-decon-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt with decon -- round 4 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.35 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/*/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - 
s3://ai2-llm/preprocessed/stack-edu/fim/documents-decon-2/fim_50pct_psm_50pct/*/allenai/dolma2-tokenizer/*.npy + - name: megamatt # NOTE(review): every other recipe names this source "megamath" -- likely a typo, confirm intentional + # 20% less the ratio for dolminos2math + # 3_906_854_120 + target_ratio: 0.01698 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10.7B + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2.48B + - 
s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4.77B + - s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254M + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .025 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .025 + paths: + # 5.60B + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round5-olmo3_7b-anneal-decon-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round5-olmo3_7b-anneal-decon-12T.yaml new file mode 100644 index 000000000..5b998aa08 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round5-olmo3_7b-anneal-decon-12T.yaml @@ -0,0 +1,354 @@ +name: "anneal-round5-100B-olmo3_7b-anneal-decon-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt with decon -- round 5 data mix" 
+budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: code_fim + target_ratio: 0.1 + paths: + # 21,390,279,634 + - s3://ai2-llm/preprocessed/stack-edu/sample-fim-weighted-pl-edu-score-decon/**/**/*.npy + - name: swallowcode + target_ratio: 0.1 + paths: + # 18,833,636,683 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data-decon/allenai/dolma2-tokenizer/*.npy + - name: megamatt + # 20% less the ratio for dolminos2math + target_ratio: 0.01698 + paths: + # 3,883,674,937 + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10,687,987,907 + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,848,999 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,380 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + # 9,860,465,314 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 21,390,279,634 -- NOTE(review): identical to the code_fim stack-edu count above; likely copy-paste, verify actual token count + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + # 4,215,210,848 + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: .05 + paths: + # 486,558,362,887 + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.0 + # paths: + # # 1,627,593,038 + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.011 + paths: + # 1,639,399,859 + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: .05 + paths: + # 17,055,708,123 + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2,483,453,165 + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4,774,150,082 + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254,415,258 + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .0125 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .0125 + paths: + # 5,601,836,260 + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + # ORIGINALS + # 1,198,073,462 --> actually code meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 366,757,554 --> has special tokens + # - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 1,198,074,765 --> different version of code meta reasoning + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #### + # FIXES + # 1,049,524,455 --> actual math meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-fixed-decon/allenai/dolma2-tokenizer/*.npy + # 364,483,656 --> verifiable/gpt-41 without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/cleaned-documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 73,752,560 --> verifiable/gpt-o4-mini without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/o4-mini-high-decon/allenai/dolma2-tokenizer/*.npy + #### + # NEW FORMAT + # 1,057,302,754 --> actual math 
meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + # 1,199,242,351 --> actual code meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + + # PDF 0.05 + - name: pdf-quality-art_design + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.000775 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.00258 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00011 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0007 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.00024 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + target_ratio: 0.005935 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.00027 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.00269 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + target_ratio: 0.00045 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.00092 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.00026 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + target_ratio: 0.02976 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + target_ratio: 0.00059 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.00435 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.00019 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + + # Web 0.225 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0002556935 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.0028240385 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.0065884955 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.0096539490 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.0077935890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.0216076590 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 0.0000286700 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.0091422550 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.0031632340 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.0154939065 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.0224049655 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.0061185640 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.0020584890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.0035336085 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + target_ratio: 0.0153408140 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: 
web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.0048655335 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.0044465105 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.0473059895 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.0010352740 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.0106495710 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.0250799635 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.0029238175 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.0021929705 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0004924385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/dolminos-baseline-anneal-olmo3_7b-100B.yaml 
b/src/cookbook/recipes/olmo3-midtraining/dolminos-baseline-anneal-olmo3_7b-100B.yaml new file mode 100644 index 000000000..9327407b5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/dolminos-baseline-anneal-olmo3_7b-100B.yaml @@ -0,0 +1,1185 @@ +name: "baseline-dolminos-anneal-100B-olmo3_7b" +description: "Baseline dolmino mix OLMo3 7b anneal to 100B Tokens" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: dolmino + target_ratio: 0.5 + paths: + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: 
http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + #SOURCE: 
http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ 
(21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - 
s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + #SOURCE: 
http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + 
#SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/pes2o/allenai/dolma2-tokenizer/ (9.76BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml index 38a3608e2..af77a48eb 100644 --- a/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml +++ b/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml @@ -11,7 +11,7 @@ sequence_length: 4096 seed: 1337 model: "olmo2_7B" tokenizer: "dolma2" -priority: urgent +priority: high eval_interval: 250 cluster: ai2/augusta-google-1 rank_microbatch_size: 8192 diff --git a/src/cookbook/recipes/olmo3-midtraining/example-olmo3_7b-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/example-olmo3_7b-microanneal.yaml new file mode 100644 index 000000000..fa7c2d451 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/example-olmo3_7b-microanneal.yaml @@ -0,0 +1,33 @@ +name: "example-olmo3-microanneal" +description: "OLMo3 7b 10B example microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true 
+load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/**/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/lr-test-dolminos-baseline-anneal-olmo3_7b-100B.yaml b/src/cookbook/recipes/olmo3-midtraining/lr-test-dolminos-baseline-anneal-olmo3_7b-100B.yaml new file mode 100644 index 000000000..1eeb594c6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/lr-test-dolminos-baseline-anneal-olmo3_7b-100B.yaml @@ -0,0 +1,1185 @@ +name: "lr-test-baseline-dolminos-anneal-100B-olmo3_7b" +description: "LR test version of baseline dolmino mix OLMo3 7b anneal to 100B Tokens" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix-from283000-200B/step288419 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: dolmino + target_ratio: 0.5 + paths: + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy 
+ - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - 
s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ 
(21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/pes2o/allenai/dolma2-tokenizer/ (9.76BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_dolminos-math.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_dolminos-math.yaml new file mode 100644 index 000000000..efb2de5e5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_dolminos-math.yaml @@ -0,0 +1,34 @@ +name: "microanneal-dolminos_math_baseline" +description: "OLMo3_4T 10B-microanneal with flat dolminos math dataset as a baseline" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: 
linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math # 10.69B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_finemath.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_finemath.yaml new file mode 100644 index 000000000..321a42990 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_finemath.yaml @@ -0,0 +1,34 @@ +name: "microanneal-finemath" +description: "OLMo3_4T 10B-microanneal with finemath" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: fineMath # 34.06B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web-pro.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web-pro.yaml new file mode 100644 index 000000000..a2ebebaae --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web-pro.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj_megamath-web-pro +description: microanneal just for megamath-web-pro (subsampled) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/megamath-web-pro/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web.yaml new file mode 100644 index 000000000..8b6389e2a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj_megamath-web +description: microanneal just for megamath-web (subsampled) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear 
+warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/megamath-web_reshard-0.05/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math-bestof.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math-bestof.yaml new file mode 100644 index 000000000..1883186ba --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math-bestof.yaml @@ -0,0 +1,39 @@ +name: "microanneal-mjnewmath_bestof" +description: "OLMo3_4T 10B-microanneal with mj's new math data and the best of Scott's new stuff" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 24.7B tokens | 8.01B requested + target_ratio: 0.80102 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: tinyMATH_MIND # 1.139B tokens | 1.139B requested + target_ratio: 0.1139 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: fullthoughts # 0.85085B tokens | 0.85B requested + target_ratio: 0.08508 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math.yaml new file mode 100644 index 000000000..ad34104f8 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math.yaml @@ -0,0 +1,39 @@ +name: "microanneal-mjnewmath" +description: "OLMo3_4T 10B-microanneal with mj's new math data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: tinyMATH_MIND # 1.139B tokens | 1.139B requested + target_ratio: 0.1139 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: fineMath # 34.06B tokens | 3.861B requested + target_ratio: 0.3861 + paths: + - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-swallowcode.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-swallowcode.yaml new file mode 100644 index 000000000..88d159430 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-swallowcode.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowcode +description: microanneal just for (10B tokens of) swallowcode +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/original_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mmwpm.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mmwpm.yaml new file mode 100644 index 000000000..520de4400 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mmwpm.yaml @@ -0,0 +1,34 @@ +name: "microanneal-megamathwebpromax" +description: "OLMo3_4T 10B-microanneal with 5B tokens of megamathwebpromax" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high 
+cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 73.9B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_omr-rewrites.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_omr-rewrites.yaml new file mode 100644 index 000000000..a9dec5338 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_omr-rewrites.yaml @@ -0,0 +1,46 @@ +name: "microanneal-omr-rewrites" +description: "OLMo3_4T 10B-microanneal with a bunch of openmathreasoning rewrites" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: openmathreasoning # 2.966B tokens | 2.965B requested + target_ratio: 0.2965 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-dialogue-2students-error-correct/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-sleek/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason-noheader/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-reformulation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-lecture/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-planning/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-clean-thoughts/allenai/dolma2-tokenizer/*.npy + - name: fineMath # 34.06B tokens | 2.034B requested + target_ratio: 0.2035 + paths: + - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_web.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_web.yaml new file mode 100644 index 000000000..668b6e9e4 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_web.yaml @@ -0,0 +1,31 @@ +name: "microanneal-web_ad18" +description: "OLMo3_4T 10B-microanneal with just web, cc-all-dressed_18" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 
+gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 24.7B tokens | 10B requested + target_ratio: 1.0 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-25B-milli_mmwpm.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-25B-milli_mmwpm.yaml new file mode 100644 index 000000000..08dcc356f --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-25B-milli_mmwpm.yaml @@ -0,0 +1,34 @@ +name: "millianneal-megamathwebpromax" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of megamathwebpromax" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 73.9B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-mjicro_mj-swallowmath.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-mjicro_mj-swallowmath.yaml new file mode 100644 index 000000000..938fca9a5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-mjicro_mj-swallowmath.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-swallowmath +description: microanneal just for (all of) swallowmath +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 7306856190 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/original_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_conv.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_conv.yaml new file mode 100644 index 000000000..56cf7e170 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_conv.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_4o_conv +description: nanonanneal just for kodkode version 4o_conv +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 442515950 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 
+rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_4o_conv_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qst.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qst.yaml new file mode 100644 index 000000000..3e226517c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qst.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_4o_qst +description: nanonanneal just for kodkode version 4o_qst +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 378311140 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_4o_qst_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qstp.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qstp.yaml new file mode 100644 index 
000000000..2b43c618b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qstp.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_4o_qstp +description: nanonanneal just for kodkode version 4o_qstp +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 438489702 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_4o_qstp_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_conv.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_conv.yaml new file mode 100644 index 000000000..92cd24884 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_conv.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_r1_conv +description: nanonanneal just for kodkode version r1_conv +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2409101190 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + 
sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_r1_conv_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qst.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qst.yaml new file mode 100644 index 000000000..f67d651bd --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qst.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_r1_qst +description: nanonanneal just for kodkode version r1_qst +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 377930418 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_r1_qst_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qstp.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qstp.yaml new file mode 100644 index 000000000..3f1af1281 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qstp.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_r1_qstp +description: nanonanneal just for kodkode version 
r1_qstp +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 365577152 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_r1_qstp_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-10B-megamatt_test_og.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-10B-megamatt_test_og.yaml new file mode 100644 index 000000000..86cb06e3a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-10B-megamatt_test_og.yaml @@ -0,0 +1,34 @@ +name: "microanneal-megamattwebpromax-megamathtest" +description: "OLMo3_4T 10B-microanneal with 5T tokens of megamath (original, pre-rewrite)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 73.9B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/megamath_web_pro_max/0731_scratch/og_tokens/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-5B-megamatt_test_rewrite.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-5B-megamatt_test_rewrite.yaml new file mode 100644 index 000000000..e482d5806 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-5B-megamatt_test_rewrite.yaml @@ -0,0 +1,34 @@ +name: "microanneal-megamattwebpromax-megamathtest_rewrites" +description: "OLMo3_4T 5B-microanneal with ~2.5T tokens of megamatt, rewritten" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_373_701_500 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | ~2.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 2.68B tokens | 2.68B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/megamath_web_pro_max/0731_scratch/rewrite_tokens/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-mjicro-megamatt.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-mjicro-megamatt.yaml 
new file mode 100644 index 000000000..ec5cb6408 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-mjicro-megamatt.yaml @@ -0,0 +1,34 @@ +name: "mjicroanneal-megamatt" +description: "OLMo3_4T mjicroanneal with 3.88B tokens of megamatt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 1 +gpus: 8 +preemptible: false +max_tokens: 7_767_300_882 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamatt # 3.88B tokens + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-sparkle-motion/**/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_clean_thoughts.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_clean_thoughts.yaml new file mode 100644 index 000000000..b41f494b3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_clean_thoughts.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_clean_thoughts +description: nanonanneal just for omr_clean_thoughts +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 736_917_144 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 
+activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-clean-thoughts/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_dialogue_2students_error_correct.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_dialogue_2students_error_correct.yaml new file mode 100644 index 000000000..5a7a0aa87 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_dialogue_2students_error_correct.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_dialogue_2students_error_correct +description: nanonanneal just for omr_dialogue_2students_error_correct +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 291_366_340 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-dialogue-2students-error-correct/allenai/dolma2-tokenizer/*.npy diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_full_thoughts.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_full_thoughts.yaml new file mode 100644 index 000000000..fb476de1f --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_full_thoughts.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_full_thoughts +description: nanonanneal just for omr_full_thoughts +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_701_719_376 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason.yaml new file mode 100644 index 000000000..82abf23f5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_metareason +description: nanonanneal just for omr_metareason +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 716_777_332 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: 
dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason_noheader.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason_noheader.yaml new file mode 100644 index 000000000..4ec457485 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason_noheader.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_metareason_noheader +description: nanoanneal just for omr_metareason_noheader +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 725_679_384 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason-noheader/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_sleek.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_sleek.yaml new file mode 100644 index 000000000..1c4553634 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_sleek.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_sleek +description: nanoanneal just for omr_sleek +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 454_093_930 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-sleek/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_lecture.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_lecture.yaml new file mode 100644 index 000000000..9f4c9e65d --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_lecture.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_teacher_student_lecture +description: nanoanneal just for omr_teacher_student_lecture +budget: ai2/oe-base +workspace: 
ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 913_258_022 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-lecture/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_planning.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_planning.yaml new file mode 100644 index 000000000..e60e42c85 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_planning.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_teacher_student_planning +description: nanoanneal just for omr_teacher_student_planning +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 184_938_956 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-planning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_reformulation.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_reformulation.yaml new file mode 100644 index 000000000..ef79a0d63 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_reformulation.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_teacher_student_reformulation +description: nanoanneal just for omr_teacher_student_reformulation +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 207_125_496 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-reformulation/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH.yaml new file mode 100644 index 000000000..72b2c502d --- 
/dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH.yaml @@ -0,0 +1,35 @@ +name: nanoanneal-mj-tinyMATH_BOTH +description: PoT/MIND only side of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2_309_538_288 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH_mjim.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH_mjim.yaml new file mode 100644 index 000000000..e5175fc41 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH_mjim.yaml @@ -0,0 +1,36 @@ +name: nanoanneal-mj-tinyMATH_BOTH_mjim +description: PoT/MIND only side of TinyMATH ++ mjim +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2_528_755_552 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear 
+warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_MIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_MIND.yaml new file mode 100644 index 000000000..52181afd5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_MIND.yaml @@ -0,0 +1,34 @@ +name: nanoanneal-mj-tinyMATH_MIND +description: MIND only side of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_828_357_020 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_PoT.yaml new file mode 100644 index 000000000..1ff74d9ec --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_PoT.yaml @@ -0,0 +1,34 @@ +name: nanoanneal-mj-tinyMATH_PoT +description: PoT only side of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 512_070_372 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_og.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_og.yaml new file mode 100644 index 000000000..6b46f222c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_og.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj_stackedurw-sanity_og +description: stack-edu python rewrites sanity check | og version +budget: ai2/oe-base +workspace: 
ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1882533086 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/stack-edu/rewrites/sanity_check/og_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_rewrite.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_rewrite.yaml new file mode 100644 index 000000000..0af61956c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_rewrite.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj_stackedurw-sanity_rewrite +description: stack-edu python rewrites sanity check | rewrite version +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2046268996 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: 
SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/stack-edu/rewrites/sanity_check/rewrite_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-1B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-1B.yaml new file mode 100644 index 000000000..fb952f576 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-1B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-1B +description: OLMo3_4T 10B-microanneal with 1B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-2B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-2B.yaml new file mode 100644 index 000000000..4441991db --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-2B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-2B 
+description: OLMo3_4T 10B-microanneal with 2B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-3B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-3B.yaml new file mode 100644 index 000000000..d503e4375 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-3B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-3B +description: OLMo3_4T 10B-microanneal with 3B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 3_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 
0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-4B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-4B.yaml new file mode 100644 index 000000000..43e2c4270 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-4B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-4B +description: OLMo3_4T 10B-microanneal with 4B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 4_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-5B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-5B.yaml new file mode 100644 index 000000000..37ca2ec7b --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-5B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-5B +description: OLMo3_4T 10B-microanneal with 5B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-6B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-6B.yaml new file mode 100644 index 000000000..b720e5f7b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-6B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-6B +description: OLMo3_4T 10B-microanneal with 6B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 6_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 
+activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-7B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-7B.yaml new file mode 100644 index 000000000..3cc791fcc --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-7B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-7B +description: OLMo3_4T 10B-microanneal with 7B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 7_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-8B.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-8B.yaml new file mode 100644 index 000000000..82bb80ed1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-8B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-8B +description: OLMo3_4T 10B-microanneal with 8B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 8_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-9B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-9B.yaml new file mode 100644 index 000000000..bd86da485 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-9B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-9B +description: OLMo3_4T 10B-microanneal with 9B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 9_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 
+model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ1.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ1.yaml new file mode 100644 index 000000000..8dc900261 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ1.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_lintQ1" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the bottom quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: 
swallowcode2 # 5.28B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q1/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ2.yaml new file mode 100644 index 000000000..ffe8c3e6a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ2.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_lintQ2" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the second quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 6.07B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q2/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ3.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ3.yaml new file mode 100644 index 000000000..049d323fd --- 
/dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ3.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_lintQ3" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the third quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 6.37B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q3/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ4.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ4.yaml new file mode 100644 index 000000000..f48dab7e6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ4.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode2_scor_lintQ4" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the fourth quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 
+global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 4.54B tokens | 5B requested + target_ratio: 0.5 + repetition_factor: 1.2 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ1.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ1.yaml new file mode 100644 index 000000000..eb3abc365 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ1.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode2_scor_sgcrQ1" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the bottom quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 
0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 2.11B tokens | 5B requested + target_ratio: 0.5 + repetition_factor: 3.0 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q1/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ2.yaml new file mode 100644 index 000000000..6c6283edc --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ2.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_sgcrQ2" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the second quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 2.11B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q2/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ3.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ3.yaml new file mode 100644 index 000000000..5cb33ccac --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ3.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_sgcrQ3" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the third quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 11.98B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q3/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ4.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ4.yaml new file mode 100644 index 000000000..77d9b023b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ4.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode2_scor_sgcrQ4" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the 
fourth quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 2.06B tokens | 5B requested + target_ratio: 0.5 + repetition_factor: 3.0 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2.yaml new file mode 100644 index 000000000..52d78302e --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2.yaml @@ -0,0 +1,36 @@ +name: "microanneal-swallowcode2_scor" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 
+load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 22B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q1/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q2/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q3/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode_scor_improved_code.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode_scor_improved_code.yaml new file mode 100644 index 000000000..fd744a753 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode_scor_improved_code.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode_scor" +description: "OLMo3_4T 10B-anneal with 5B tokens of swallowcode with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 1 +gpus: 8 +preemptible: false +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + -
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode_scor # 18.8B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode2.yaml new file mode 100644 index 000000000..7c9073db7 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode2.yaml @@ -0,0 +1,36 @@ +name: "millianneal-swallowcode2_scor" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of swallowcode2 with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 22B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q1/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q2/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q3/*.npy + -
s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_scor_improved_code.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_scor_improved_code.yaml new file mode 100644 index 000000000..1fb9d82c0 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_scor_improved_code.yaml @@ -0,0 +1,34 @@ +name: "millianneal-swallowcode_scor" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of swallowcode with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode_scor # 18.8B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_sgcr_improved_code.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_sgcr_improved_code.yaml new file mode 100644 index 000000000..8c71518be --- /dev/null +++
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_sgcr_improved_code.yaml @@ -0,0 +1,34 @@ +name: "millianneal-swallowcode_sgcr" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of swallowcode with SGCR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode_sgcr # 14.37B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/sgcr_improved_code_tokens/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample.yaml new file mode 100644 index 000000000..867c8048c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcodeMulti_sample" +description: "OLMo3_4T | Using the SGCR rewrites of the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 7_045_850_528 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix"
+tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_sgcr_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sampleOG.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sampleOG.yaml new file mode 100644 index 000000000..4e71abc11 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sampleOG.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcodeMulti_sampleOG" +description: "OLMo3_4T | Using the OG data rewritten in the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 6_860_126_676 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_sgcr_data_og/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scor.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scor.yaml new file mode 100644 index 000000000..b7c75f85c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scor.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcodeMulti_sample_scor" +description: "OLMo3_4T | Using the SCOR rewrites of the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_915_766_938 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5.45B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_scor2_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scorpy.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scorpy.yaml new file mode 100644 index 000000000..cc495ab77 --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scorpy.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcodeMulti_sample_scorPy" +description: "OLMo3_4T | Using the (pythonic) SCOR rewrites of the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 15_518_309_282 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_scor_data_py/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2.yaml new file mode 100644 index 000000000..08e60ceb6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmatt2 +description: microanneal just for 5B tokens of swallowmath rewrites (v2, from megamath_web_pro) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: 
ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: false +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/swallow_mmw/beaker_outputs/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2OG.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2OG.yaml new file mode 100644 index 000000000..a8aec2b05 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2OG.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmatt2OG +description: microanneal just for 5B tokens of swallowmath v2 data (before rewrites, i.e. 
sanitized megamathwebpro) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/megamath_web_pro_max_sansFm4p_dedup/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2_restart.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2_restart.yaml new file mode 100644 index 000000000..a7fe67539 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2_restart.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmatt2_restart +description: microanneal just for 5B tokens of swallowmath rewrites (v2, from megamath_web_pro) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: 
gs://ai2-llm/checkpoints/mattj/microanneal-mj-swallowmathtest-swallowmatt2-3b4f423b/step3000 +load_state: true +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/swallow_mmw/beaker_outputs/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmattPROPER.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmattPROPER.yaml new file mode 100644 index 000000000..327e2ee92 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmattPROPER.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmattPROPER +description: microanneal just for 5B tokens of swallowmath rewrites +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_mix.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_mix.yaml new file mode 100644 index 000000000..3d4941058 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_mix.yaml @@ -0,0 +1,37 @@ +name: mjicroanneal-mj-swallowmathtest-swallowmatt-mix_proper +description: microanneal to have mix of rewrite and og data +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: OG_DATA + target_ratio: 0.25 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/rewrite_diversity_check/og_tokens/allenai/dolma2-tokenizer/*.npy + - name: REWRITE_DATA + target_ratio: 0.25 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/rewrite_diversity_check/rewrite_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_rewritex2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_rewritex2.yaml new file mode 100644 index 000000000..5959307ae --- /dev/null +++
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_rewritex2.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-swallowmathtest-swallowmatt-rewritex2_proper +description: microanneal to have 2x rewrites +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + repetition_factor: 2.0 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/rewrite_diversity_check/rewrite_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_fm4p.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_fm4p.yaml new file mode 100644 index 000000000..2573b003a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_fm4p.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-swallowmathtest-fm4p +description: microanneal just for just the finemath4plus version of swallowmath (i.e., og requests) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 13783479380 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: 
ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/tokens/og_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_swallowmatt.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_swallowmatt.yaml new file mode 100644 index 000000000..aca1cb9ed --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_swallowmatt.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-swallowmathtest-swallowmatt +description: microanneal just for just what we have as swallowmatt rewrites of swallowmath go (i.e., the non-llama rewrites) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 8673251024 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/tokens/rewritten_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATH2PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATH2PoT.yaml new file mode 100644 index 000000000..0af38d779 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATH2PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal2.5-mj-tinyMATHPoT2 +description: mjicroanneal just for the PoT of TinyMATH2 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 414_687_378 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo25/step1413814 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATHPoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATHPoT.yaml new file mode 100644 index 000000000..972c18336 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATHPoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal2.5-mj-tinyMATHPoT +description: mjicroanneal just for the decon of the OG tinyMATH 
+budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 481_181_268 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo25/step1413814 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2MIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2MIND.yaml new file mode 100644 index 000000000..09875393c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2MIND.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH2MIND +description: mjicroanneal just for the MIND of TinyMATH2 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 2_264_755_032 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_ps_merged/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_2stud_merged/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2PoT.yaml new file mode 100644 index 000000000..b7656ded3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATHPoT2 +description: mjicroanneal just for the PoT of TinyMATH2 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 414_687_378 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3MIND.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3MIND.yaml new file mode 100644 index 000000000..8b3c5b779 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3MIND.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH3MIND +description: mjicroanneal just for the MIND of TinyMATH3 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 2_158_600_528 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/2stud_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/ps_tokens/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3PoT.yaml new file mode 100644 index 000000000..bb88d1818 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH3PoT +description: mjicroanneal just for the PoT of TinyMATH3 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 
1_078_693_624 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/pot_data/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND.yaml new file mode 100644 index 000000000..d9a5768d1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH4MIND +description: mjicroanneal just for the MIND of TinyMATH4 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 774_164_830 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/2stud_data/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/ps_data/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND_uncurse.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND_uncurse.yaml new file mode 100644 index 000000000..00a074b6c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND_uncurse.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH4MIND_uncurse +description: mjicroanneal just for the MIND of TinyMATH4 (uncurse version) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 774_164_830 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 42_069 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/2stud_data/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/ps_data/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT.yaml new file mode 100644 index 000000000..f821a24f1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH4PoT +description: mjicroanneal just for the PoT of TinyMATH4 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 786_891_408 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2.yaml new file mode 100644 index 000000000..fe7a245cb --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH4PoT2 +description: mjicroanneal just for the PoT2 of TinyMATH4 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 479_294_178 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: 
ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data2/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2_uncurse.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2_uncurse.yaml new file mode 100644 index 000000000..9272e4425 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2_uncurse.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH4PoT2_uncurse +description: mjicroanneal just for the PoT2 of TinyMATH4 (rerun, uncursed?) 
+budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 479_294_178 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 42_069 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data2/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHMIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHMIND.yaml new file mode 100644 index 000000000..b53792ca8 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHMIND.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATHMIND +description: mjicroanneal just for the MIND of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 1_797_460_446 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHPoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHPoT.yaml new file mode 100644 index 000000000..f6b66d223 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHPoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATHPoT +description: mjicroanneal just for the decon of the OG tinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 481_181_268 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy.yaml new file mode 100644 index 
000000000..4c669e2ea --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy.yaml @@ -0,0 +1,41 @@ +name: mjicroanneal-mj-tinyMATH_allholy +description: mjicroanneal for the MIND/PoT of TinyMATH's 1-4 (minus the cursed sets) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 8_554_855_728 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_ps_merged/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_2stud_merged/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/2stud_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/ps_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/pot_data/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/2stud_data/allenai/dolma2-tokenizer/*.npy 
+ - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/ps_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy_decon.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy_decon.yaml new file mode 100644 index 000000000..42ed72632 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy_decon.yaml @@ -0,0 +1,40 @@ +name: mjicroanneal-mj-tinyMATH_allholy_decon +description: mjicroanneal for the MIND/PoT of TinyMATH's 1-4 (minus the cursed sets) + DECON +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 8_259_659_924 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_2stud_merged-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_ps_merged-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/2stud_data-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/ps_data-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/pot_data-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data2-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/smoke-test-swafix-olmo3_7b-anneal.yaml b/src/cookbook/recipes/olmo3-midtraining/smoke-test-swafix-olmo3_7b-anneal.yaml new file mode 100644 index 000000000..830eb59b8 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/smoke-test-swafix-olmo3_7b-anneal.yaml @@ -0,0 +1,29 @@ +name: "olmo3-7b_microanneal-smoke-test" +description: "OLMo3 7b microanneal smoke test with swa fix" +budget: "ai2/oe-base" +workspace: "ai2/oe-data" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_000_000_000 +global_batch_size: 2097152 +sequence_length: 4096 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step172000 +load_state: false +dataset: + sources: + - name: code + target_ratio: 1.0 + paths: + - gs://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl-v2.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl-v2.yaml new file mode 100644 index 000000000..35571ae49 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl-v2.yaml @@ -0,0 +1,33 @@ +name: 
"olmo2-7b_10b-anneal_fim-stack-edu-hq-weighted-pl-dclm-v2" +description: "OLMo2 7b anneal to 10B Tokens for fim-stack-edu hq weighted-pl and dclm baseline" +budget: "ai2/oe-training" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 4096 +seed: 1337 +model: "olmo2_7B" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +annealing: + enabled: true + initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA +load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ +load_state: false +dataset: + sources: + - name: fim-stackedu-hq-v2-weighted-pl + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v2/allenai/dolma2-tokenizer/**/*.npy + - name: dclm-baseline-olmo2 + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl.yaml new file mode 100644 index 000000000..7c521f171 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl.yaml @@ -0,0 +1,33 @@ +name: "olmo2-7b_10b-anneal_fim-stack-edu-hq-weighted-pl-dclm" +description: "OLMo2 7b anneal to 10B Tokens for fim-stack-edu hq weighted-pl and dclm baseline" +budget: "ai2/oe-training" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 4096 +seed: 1337 +model: "olmo2_7B" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 
+rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +annealing: + enabled: true + initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA +load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ +load_state: false +dataset: + sources: + - name: fim-stackedu-hq-v2-weighted-pl + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: dclm-baseline-olmo2 + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-cmprs-filter.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-cmprs-filter.yaml new file mode 100644 index 000000000..f27aae1e0 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-cmprs-filter.yaml @@ -0,0 +1,69 @@ +name: "olmo3-7b-10B-all-sources-fim-stackedu-hq-filter-weighted-pl-v0-compress-filter" +description: "OLMo3 7b 10B with all sources + fim-stack-edu-hq-filter-weighted-pl-v0-compress-filter" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.452 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + # 19.9B + - name: code_fim-stackedu-hq-filter-v0-compress-filter + target_ratio: 0.198 + paths: + - s3://ai2-llm/preprocessed/stack-edu-fim/fim-weighted-pl-20B-decon_cmprs-filter-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-reweight-v1.yaml 
b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-reweight-v1.yaml new file mode 100644 index 000000000..ee45b1165 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-reweight-v1.yaml @@ -0,0 +1,68 @@ +name: "olmo3-7b-10B-all-sources-fim-stackedu-hq-filter-weighted-pl-v0-reweight-v1" +description: "OLMo3 7b 10B with all sources + fim-stack-edu-hq-filter-weighted-pl-v0, reweighted (v1)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.35 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code_fim-stackedu-hq-filter-v0 + target_ratio: 0.30 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl.yaml new file mode 100644 index 000000000..9caaf2ce1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl.yaml @@ -0,0 +1,68 @@ +name: "olmo3-7b-10B-all-sources-fim-stackedu-hq-filter-weighted-pl-v0" +description: "OLMo3 7b 10B with all sources + fim-stack-edu-hq-filter-weighted-pl-v0" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: 
gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code_fim-stackedu-hq-filter-v0 + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + diff --git 
a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_fim-stack-edu-hq-weighted-pl.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_fim-stack-edu-hq-weighted-pl.yaml new file mode 100644 index 000000000..61766e23b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_fim-stack-edu-hq-weighted-pl.yaml @@ -0,0 +1,34 @@ +name: "olmo3-7b-10B-micro-fim-stack-edu-hq-weighted-pl-alldressed" +description: "OLMo3 7b anneal to 10B Tokens for fim-stack-edu hq weighted-pl-v0 + all dressed hq web" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: fim-stackedu-hq-v2-weighted-pl + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/utils/config.py b/src/cookbook/utils/config.py index ee8d6bf0a..85eb8c0ef 100644 --- a/src/cookbook/utils/config.py +++ b/src/cookbook/utils/config.py @@ -196,10 +196,7 @@ def build_train_config(config_path: Path, run_name: str, group_id: str, beaker_u trainer = config.trainer.build(train_module, data_loader) # If we have a load path and there is no checkpoint in the save folder, load the checkpoint from the load path. 
- if ( - not trainer.maybe_load_checkpoint(trainer.save_folder, load_trainer_state=base_config.load_state) - and base_config.load_path - ): + if not trainer.maybe_load_checkpoint(trainer.save_folder) and base_config.load_path: logger.info( f"Loading checkpoint from {base_config.load_path} and load_trainer_state: {base_config.load_state}" ) @@ -241,7 +238,7 @@ def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLa budget=group.config.budget or "ai2/oe-data", workspace=group.config.workspace, preemptible=group.config.preemptible, - beaker_image="petew/olmo-core-tch270cu126", + beaker_image="petew/olmo-core-tch270cu128", priority=group.config.priority, env_vars=[BeakerEnvVar(name="NCCL_DEBUG", value="INFO" if group.config.nccl_debug else "WARN")], env_secrets=[ @@ -253,6 +250,7 @@ def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLa BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), BeakerEnvSecret(name="GOOGLE_CLOUD_PROJECT", secret="GOOGLE_CLOUD_PROJECT"), ], + retries=3, setup_steps=[ 'git clone "$REPO_URL"', "conda shell.bash activate base", diff --git a/src/cookbook/utils/data.py b/src/cookbook/utils/data.py index c293b5c8f..ae2fba547 100644 --- a/src/cookbook/utils/data.py +++ b/src/cookbook/utils/data.py @@ -76,7 +76,7 @@ def get_token_counts_and_ratios( if scheme not in filesystems: filesystems[scheme] = get_filesystem_for_scheme(scheme) - with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor: + with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor: for source in source_configs: # Get the appropriate filesystem for this source scheme = next(iter({urlparse(path).scheme for path in source.paths}), "local")