diff --git a/README.md b/README.md index 441defb60..8dae4478e 100644 --- a/README.md +++ b/README.md @@ -315,3 +315,10 @@ All PMR CLI commands support the following options: | `--script` | `-s` | None | Path to script file or directory to execute | Note that you can provide either `--command` or `--script`, but not both. When using `--script` with a directory path, all executable files in that directory will be distributed across the instances. + + +# Midtraining utilities +I (mj) built out some utilities to minimize manual labor for common tasks for midtraining. +- [`scripts/gs2weka.py`](scripts/gs2weka.py): This script finds the latest checkpoint for a given model configuration and copies it from Google Cloud Storage to Weka storage using olmo-cookbook. Run with `python scripts/gs2weka.py <config.yaml>` to automatically detect your Beaker account and process the latest checkpoint, or use --beaker-name to specify a different account name. +- [`scripts/convert_from_config.py`](scripts/convert_from_config.py): This script finds the latest checkpoint in Weka storage for a given model configuration and converts it to HuggingFace format using olmo-cookbook-eval. Run with `python scripts/convert_from_config.py <config.yaml>` to automatically detect your Beaker account and convert the latest checkpoint, with optional --overwrite flag to reconvert existing checkpoints. +- [`scripts/olmo3_midtrain_eval.sh`](scripts/olmo3_midtrain_eval.sh): This script runs OLMo3 midtraining evaluations on a given checkpoint path using two different task suites (midtrain and main). Run with `bash scripts/olmo3_midtrain_eval.sh <checkpoint_path>` where the checkpoint path should point to a converted HuggingFace format checkpoint (e.g., ending in -hf). 
diff --git a/pyproject.toml b/pyproject.toml index 3f20e6ebc..5e2bf491b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ checkpoints = [ "boto3" ] all = [ - "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@c779ca546cc3194e73e7491aaefcdffbed042c65", + "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@tylerr/olmo3-scripts-swafix-foreachopt", "beaker-py>=1,<2", "GitPython>=3.0,<4.0", "wandb", diff --git a/scripts/convert_from_config.py b/scripts/convert_from_config.py new file mode 100755 index 000000000..061649591 --- /dev/null +++ b/scripts/convert_from_config.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +""" +Script to process YAML file and run olmo-cookbook command with latest checkpoint +""" +import argparse +import re +import subprocess +import sys +from pathlib import Path + +import yaml + + +def run_command(cmd, shell=False, errs_okay=False): + """Run a shell command and return stdout""" + try: + if shell: + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) + else: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return result.stdout.strip() + except subprocess.CalledProcessError as e: + print(f"Error running command: {' '.join(cmd) if isinstance(cmd, list) else cmd}") + print(f"Error: {e.stderr}") + if not errs_okay: + sys.exit(1) + raise e + + +def get_yaml_name(yaml_file): + """Extract the 'name' attribute from YAML file""" + try: + with open(yaml_file, "r") as f: + data = yaml.safe_load(f) + + if "name" not in data: + print(f"Error: 'name' attribute not found in {yaml_file}") + sys.exit(1) + + return data["name"] + except Exception as e: + print(f"Error reading YAML file {yaml_file}: {e}") + sys.exit(1) + + +def get_beaker_name(): + """Get the NAME from 'beaker account whoami' output""" + output = run_command(["beaker", "account", "whoami"]) + + # Parse the table output to extract NAME + lines = output.strip().split("\n") + if len(lines) < 2: + 
print("Error: Unexpected output from 'beaker account whoami'") + sys.exit(1) + + # Look for the data row (skip header) + for line in lines[1:]: + parts = line.split() + if len(parts) >= 2: + return parts[1] # NAME is the second column + + print("Error: Could not extract NAME from beaker account whoami output") + sys.exit(1) + + +def find_latest_checkpoint(beaker_name, yaml_name): + """Find the latest checkpoint directory in weka""" + + weka_path = f"weka://oe-training-default/ai2-llm/checkpoints/{beaker_name}/{yaml_name}-*" + + # Convert weka:// path to s3:// path for s5cmd + s3_path = weka_path.replace("weka://oe-training-default/", "s3://oe-training-default/") + + # Add wildcard to check for any files in the directory + s3_path_wildcard = f"{s3_path}/*" + + print(f"Checking if weka path exists: {weka_path}") + print(f"Using s5cmd to check: {s3_path_wildcard}") + + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + s3_path_wildcard, + ] + + try: + output = run_command(cmd, errs_okay=True) + if not output: + print(f"No checkpoints found with prefix: {prefix}") + sys.exit(1) + + # Get all matching paths + paths = output.strip().split("\n") + + # Sort paths to get the latest one (lexicographically) + paths = [_.split(" ")[-1].strip() for _ in paths] + ckpts = set() + for p in paths: + re_string = yaml_name + r"-[0-9a-f]{8}/step\d+/" + if re.match(re_string, p): + ckpts.add(re.match(re_string, p).group()) + assert ( + len(ckpts) > 0 + ), "No valid checkpoints found??? 
[this should assert should never fail if we got here to begin with]" + max_ckpt = max(ckpts) + print(max_ckpt) + return "weka://oe-training-default/ai2-llm/checkpoints/%s/%s" % (beaker_name, max_ckpt) + + except subprocess.CalledProcessError as e: + print("No weka paths found!") + print( + f"Make sure you have access to weka://oe-training-deafult/ai2-llm/checkpoints/{beaker_name}/{yaml_name}-* directories" + ) + raise e + # sys.exit(1) + except Exception as e: + print("ERR CODE ", e) + raise e + + +def check_hf_path_exists(latest_ckpt): + """Check if the corresponding weka path already exists""" + # Convert gs:// path to weka:// path + hf_path = latest_ckpt.rstrip("/") + "-hf/*" + + print(f"Checking if weka path exists: {hf_path}") + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + hf_path, + ] + + # Convert weka:// path to s3:// path for s5cmd + hf_path = hf_path.replace("weka://oe-training-default/", "s3://oe-training-default/") + + print(f"Checking if weka path exists: {hf_path}") + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + hf_path, + ] + + try: + # Run the command - if it succeeds, the path exists + output = run_command(cmd, errs_okay=True) + print(f"āœ… Weka path exists - found %s files:" % len(output.split("\n"))) + return True + except subprocess.CalledProcessError as e: + # If the command fails, the path doesn't exist + print(f"āŒ Weka path does not exist (s5cmd failed as expected)") + return False + + +def run_olmo_cookbook(weka_path): + """Run the olmo-cookbook command with the GCS path""" + print("Converting %s" % weka_path) + weka_path = weka_path.replace("weka://", "/").rstrip("/") + cmd = [ + "olmo-cookbook-eval", + "convert", + weka_path, + "-t", + "olmo-core-v2", + "--use-beaker", + "--huggingface-transformers-git-url", + "https://github.com/2015aroras/transformers.git", + "--huggingface-transformers-commit-hash", + 
"ae3889ced6ed7362e5883671fc6dc4cb4fece5fa", + "--olmo-core-v2-commit-hash", + "57a04d0b69047d797c96eede056a211e75b5914a", + ] + print(f"Running: {' '.join(cmd)}") + + try: + # Run the command and stream output in real-time + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True + ) + + beaker_url = None + beaker_url_pattern = re.compile(r"https://beaker\.org/ex/[A-Z0-9]+") + + for line in process.stdout: + print(line, end="") + + # Look for the beaker URL in the output + match = beaker_url_pattern.search(line) + if match: + beaker_url = match.group(0) + + process.wait() + + if process.returncode != 0: + print(f"Error: olmo-cookbook command failed with return code {process.returncode}") + sys.exit(1) + + # Print the extracted Beaker URL + if beaker_url: + print(f"\n" + "=" * 60) + print(f"šŸ”— Beaker Experiment URL: {beaker_url}") + print(f"=" * 60) + return beaker_url + else: + print("\nWarning: Could not extract Beaker experiment URL from output") + + except Exception as e: + print(f"Error running olmo-cookbook: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser(description="Process YAML file and run olmo-cookbook with latest checkpoint") + parser.add_argument("yaml_file", help="Path to the YAML file") + parser.add_argument("--beaker-name", required=False, default=None) + parser.add_argument("--overwrite", required=False, type=bool, default=False) + args = parser.parse_args() + + # Validate input file exists + if not Path(args.yaml_file).exists(): + print(f"Error: YAML file {args.yaml_file} does not exist") + sys.exit(1) + + print(f"Processing YAML file: {args.yaml_file}") + + # Step 1: Get name from YAML + yaml_name = get_yaml_name(args.yaml_file) + print(f"YAML name: {yaml_name}") + + # Step 2: Get beaker name + if args.beaker_name == None: + beaker_name = get_beaker_name() + else: + beaker_name = args.beaker_name + print(f"Beaker name: {beaker_name}") + + # Step 3: 
Find latest checkpoint + print( + f"Searching for checkpoints with prefix: weka://oe-training-default/ai2-llm/checkpoints/{beaker_name}/{yaml_name}-" + ) + latest_checkpoint = find_latest_checkpoint(beaker_name, yaml_name) + print(f"Latest checkpoint: {latest_checkpoint}") + + # Step 4: Check if weka path already exists + if check_hf_path_exists(latest_checkpoint) and not args.overwrite: + print(f"\n🚫 Converted checkpoint already exists in weka storage. Skipping cookbook command.") + print(f"The checkpoint has already been copied to weka://oe-training-default/") + return + + # Step 5: Run olmo-cookbook command + run_olmo_cookbook(latest_checkpoint) + + +if __name__ == "__main__": + main() diff --git a/scripts/gs2weka.py b/scripts/gs2weka.py new file mode 100644 index 000000000..91cd23067 --- /dev/null +++ b/scripts/gs2weka.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +Script to process YAML file and run olmo-cookbook command with latest checkpoint +""" +import argparse +import re +import subprocess +import sys +from pathlib import Path + +import yaml +from google.auth.exceptions import DefaultCredentialsError +from google.cloud import storage + + +def run_command(cmd, shell=False, errs_okay=False): + """Run a shell command and return stdout""" + try: + if shell: + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) + else: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return result.stdout.strip() + except subprocess.CalledProcessError as e: + print(f"Error running command: {' '.join(cmd) if isinstance(cmd, list) else cmd}") + print(f"Error: {e.stderr}") + if not errs_okay: + sys.exit(1) + raise e + + +def get_yaml_name(yaml_file): + """Extract the 'name' attribute from YAML file""" + try: + with open(yaml_file, "r") as f: + data = yaml.safe_load(f) + + if "name" not in data: + print(f"Error: 'name' attribute not found in {yaml_file}") + sys.exit(1) + + return data["name"] + except Exception as e: 
+ print(f"Error reading YAML file {yaml_file}: {e}") + sys.exit(1) + + +def get_beaker_name(): + """Get the NAME from 'beaker account whoami' output""" + output = run_command(["beaker", "account", "whoami"]) + + # Parse the table output to extract NAME + lines = output.strip().split("\n") + if len(lines) < 2: + print("Error: Unexpected output from 'beaker account whoami'") + sys.exit(1) + + # Look for the data row (skip header) + for line in lines[1:]: + parts = line.split() + if len(parts) >= 2: + return parts[1] # NAME is the second column + + print("Error: Could not extract NAME from beaker account whoami output") + sys.exit(1) + + +def find_latest_checkpoint(beaker_name, yaml_name): + """Find the latest checkpoint directory in GCS""" + # Construct the GCS path prefix (without gs://) + bucket_name = "ai2-llm" + prefix = f"checkpoints/{beaker_name}/{yaml_name}-" + + try: + # Initialize the GCS client + client = storage.Client(project="ai2-allennlp") + bucket = client.bucket(bucket_name) + + print(f"Searching for checkpoints with prefix: gs://{bucket_name}/{prefix}") + + # List all blobs with the prefix + blobs = bucket.list_blobs(prefix=prefix) + + # Find paths that match the pattern: prefix*/step*/ + checkpoint_paths = [] + for blob in blobs: + # Split the blob name into parts + parts = blob.name.split("/") + + # Check if this looks like a checkpoint directory structure + # We want: checkpoints/{beaker_name}/{yaml_name}-{something}/step{something}/ + if len(parts) >= 4: # At least checkpoints/beaker/yaml-*/step*/ + # Check if there's a step directory in the path + for i, part in enumerate(parts): + if part.startswith("step") and i < len(parts) - 1: + # Construct the directory path up to and including the step directory + step_dir_path = "/".join(parts[: i + 1]) + full_path = f"gs://{bucket_name}/{step_dir_path}" + + if full_path not in checkpoint_paths: + checkpoint_paths.append(full_path) + break + + if not checkpoint_paths: + print(f"No checkpoints found with 
prefix: gs://{bucket_name}/{prefix}") + sys.exit(1) + + # Sort paths to get the latest one (lexicographically) + checkpoint_paths.sort(reverse=True) + + print(f"Found {len(checkpoint_paths)} checkpoint directories") + print(f"Latest checkpoint: {checkpoint_paths[0]}") + + return checkpoint_paths[0] + + except DefaultCredentialsError: + print("Error: Google Cloud credentials not found.") + print("Please set up authentication:") + print("1. Set GOOGLE_APPLICATION_CREDENTIALS environment variable:") + print(" export GOOGLE_APPLICATION_CREDENTIALS='/path/to/service-account-key.json'") + print("2. Or run: gcloud auth application-default login") + sys.exit(1) + + except Exception as e: + print(f"Error listing GCS directories: {e}") + print(f"Make sure you have access to gs://{bucket_name}/{prefix}* directories") + sys.exit(1) + + +# Alternative implementation that's more efficient for large buckets +def find_latest_checkpoint_optimized(beaker_name, yaml_name): + """ + More efficient version that uses prefix listing to avoid scanning all objects. + This works better when you have many files in the bucket. 
+ """ + bucket_name = "ai2-llm" + prefix = f"checkpoints/{beaker_name}/{yaml_name}-" + + try: + client = storage.Client() + bucket = client.bucket(bucket_name) + + # Get all "directories" by using delimiter + # This is more efficient as it doesn't list individual files + blobs = bucket.list_blobs(prefix=prefix, delimiter="/") + + # Collect all run directories (yaml_name-*) + run_prefixes = [] + for page in blobs.pages: + run_prefixes.extend(page.prefixes) + + if not run_prefixes: + print(f"No run directories found with prefix: gs://{bucket_name}/{prefix}") + sys.exit(1) + + # For each run directory, find step directories + checkpoint_paths = [] + for run_prefix in run_prefixes: + step_blobs = bucket.list_blobs(prefix=run_prefix, delimiter="/") + + step_prefixes = [] + for page in step_blobs.pages: + step_prefixes.extend(page.prefixes) + + # Filter for step directories + for step_prefix in step_prefixes: + if "/step" in step_prefix: + full_path = f"gs://{bucket_name}/{step_prefix.rstrip('/')}" + checkpoint_paths.append(full_path) + + if not checkpoint_paths: + print(f"No step directories found in runs matching: gs://{bucket_name}/{prefix}") + sys.exit(1) + + # Sort paths to get the latest one + checkpoint_paths.sort(reverse=True) + + print(f"Found {len(checkpoint_paths)} checkpoint directories") + print(f"Latest checkpoint: {checkpoint_paths[0]}") + + return checkpoint_paths[0] + + except Exception as e: + print(f"Error listing GCS directories: {e}") + print(f"Make sure you have access to gs://{bucket_name}/{prefix}* directories") + sys.exit(1) + + +def check_weka_path_exists(gs_path): + """Check if the corresponding weka path already exists""" + # Convert gs:// path to weka:// path + if not gs_path.startswith("gs://"): + print(f"Error: Expected gs:// path, got: {gs_path}") + return False + + weka_path = gs_path.replace("gs://", "weka://oe-training-default/") + + # Convert weka:// path to s3:// path for s5cmd + s3_path = 
weka_path.replace("weka://oe-training-default/", "s3://oe-training-default/") + + # Add wildcard to check for any files in the directory + s3_path_wildcard = f"{s3_path}/*" + + print(f"Checking if weka path exists: {weka_path}") + print(f"Using s5cmd to check: {s3_path_wildcard}") + + cmd = [ + "s5cmd", + "--profile", + "WEKA", + "--endpoint-url", + "https://weka-aus.beaker.org:9000", + "ls", + s3_path_wildcard, + ] + + try: + # Run the command - if it succeeds, the path exists + output = run_command(cmd, errs_okay=True) + + print(f"āœ… Weka path exists - found %s files:" % len(output.split("\n"))) + # print(output) + return True + + except subprocess.CalledProcessError as e: + # If the command fails, the path doesn't exist + print(f"āŒ Weka path does not exist (s5cmd failed as expected)") + return False + except Exception as e: + print("ERR CODE", e) + raise e + + +def run_olmo_cookbook(gs_path): + """Run the olmo-cookbook command with the GCS path""" + weka_path = gs_path.replace("gs://", "weka://oe-training-default/") + + cmd = ["python", "-m", "cookbook.remote", gs_path, weka_path] + + print(f"Running: {' '.join(cmd)}") + + try: + # Run the command and stream output in real-time + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True + ) + + beaker_url = None + beaker_url_pattern = re.compile(r"https://beaker\.org/ex/[A-Z0-9]+") + + for line in process.stdout: + print(line, end="") + + # Look for the beaker URL in the output + match = beaker_url_pattern.search(line) + if match: + beaker_url = match.group(0) + + process.wait() + + if process.returncode != 0: + print(f"Error: olmo-cookbook command failed with return code {process.returncode}") + sys.exit(1) + + # Print the extracted Beaker URL + if beaker_url: + print(f"\n" + "=" * 60) + print(f"šŸ”— Beaker Experiment URL: {beaker_url}") + print(f"=" * 60) + return beaker_url + else: + print("\nWarning: Could not extract Beaker 
experiment URL from output") + + except Exception as e: + print(f"Error running olmo-cookbook: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser(description="Process YAML file and run olmo-cookbook with latest checkpoint") + parser.add_argument("yaml_file", help="Path to the YAML file") + parser.add_argument("--beaker-name", required=False, default=None) + args = parser.parse_args() + + # Validate input file exists + if not Path(args.yaml_file).exists(): + print(f"Error: YAML file {args.yaml_file} does not exist") + sys.exit(1) + + print(f"Processing YAML file: {args.yaml_file}") + + # Step 1: Get name from YAML + yaml_name = get_yaml_name(args.yaml_file) + print(f"YAML name: {yaml_name}") + + # Step 2: Get beaker name + if args.beaker_name == None: + beaker_name = get_beaker_name() + else: + beaker_name = args.beaker_name + print(f"Beaker name: {beaker_name}") + + # Step 3: Find latest checkpoint + print(f"Searching for checkpoints with prefix: gs://ai2-llm/checkpoints/{beaker_name}/{yaml_name}-") + latest_checkpoint = find_latest_checkpoint(beaker_name, yaml_name) + print(f"Latest checkpoint: {latest_checkpoint}") + + # Step 4: Check if weka path already exists + if check_weka_path_exists(latest_checkpoint): + print(f"\n🚫 Checkpoint already exists in weka storage. 
Skipping cookbook command.") + print(f"The checkpoint has already been copied to weka://oe-training-default/") + return + + # Step 5: Run olmo-cookbook command + run_olmo_cookbook(latest_checkpoint) + + +if __name__ == "__main__": + main() diff --git a/scripts/olmo3_midtrain_eval.sh b/scripts/olmo3_midtrain_eval.sh new file mode 100644 index 000000000..258b1764f --- /dev/null +++ b/scripts/olmo3_midtrain_eval.sh @@ -0,0 +1,48 @@ + +# Check if first argument is provided +if [ $# -eq 0 ]; then + echo "Usage: $0 " + echo "Example: $0 /oe-training-default/ai2-llm/checkpoints/mattj/microanneal-dolminos_math_baseline-1B-ffabe337/step477-hf" + exit 1 +fi + +# Store the first argument in a variable +CHECKPOINT_PATH="$1" +DASHBOARD="$2" + +olmo-cookbook-eval evaluate \ + "$CHECKPOINT_PATH" \ + --tasks olmo3:dev:midtrain:v1 \ + --priority high \ + --cluster aus80g \ + --num-gpus 1 \ + --partition-size 8 \ + --model-backend vllm \ + --no-compute-gold-bpb \ + --model-args chat_template=basic_answer,trust_remote_code=true,max_length=8192 \ + --use-gantry \ + --gantry-args env-secret="OPENAI_API_KEY=openai_api_key" \ + --task-args chat_overrides="{\"generation_kwargs\": {\"stop_sequences\": [\"Problem:\", \"Answer:\", \"Question:\", \"\", \"<|eot_id|>\"]}}" \ + --fim-tokens l2c \ + --oe-eval-branch davidh/olmo3 \ + --beaker-image oe-eval-beaker/oe_eval_olmo3_auto \ + --vllm-use-v1-spec \ + --dashboard $DASHBOARD \ + --workspace ai2/olmo-3-microanneals + +olmo-cookbook-eval evaluate \ + "$CHECKPOINT_PATH" \ + --tasks olmo3:dev:7b:main:v2 \ + --priority high \ + --cluster aus80g \ + --partition-size 8 \ + --num-gpus 1 \ + --model-backend vllm \ + --model-args trust_remote_code=true,max_length=4096 \ + --beaker-image oe-eval-beaker/oe_eval_olmo3_auto\ + --fim-tokens l2c \ + --vllm-use-v1-spec \ + --vllm-memory-utilization 0.7 \ + --dashboard $DASHBOARD \ + --workspace ai2/olmo-3-microanneals + diff --git a/src/cookbook/cli/eval.py b/src/cookbook/cli/eval.py index 
3ab70c506..e02e37785 100644 --- a/src/cookbook/cli/eval.py +++ b/src/cookbook/cli/eval.py @@ -6,8 +6,8 @@ import click from rich.console import Console -from rich.table import Table from rich.pretty import pprint +from rich.table import Table from cookbook.cli.utils import ( get_aws_access_key_id, @@ -26,10 +26,10 @@ TRANSFORMERS_COMMIT_HASH, TRANSFORMERS_GIT_URL, ) -from cookbook.eval.named_tasks import BaseNamedTasksGroup, NamedTasksGroupRegistry from cookbook.eval.conversion import run_checkpoint_conversion from cookbook.eval.datalake import AddToDashboard, FindExperiments, RemoveFromDashboard from cookbook.eval.evaluation import evaluate_checkpoint +from cookbook.eval.named_tasks import BaseNamedTasksGroup, NamedTasksGroupRegistry from cookbook.eval.results import make_dashboard_table, print_missing_tasks logger = logging.getLogger(__name__) @@ -428,8 +428,8 @@ def evaluate_model( dashboard, model_name, tasks, - format='return_missing', - sort_by='avg', + format="return_missing", + sort_by="avg", sort_column_name=None, sort_descending=None, force=False, @@ -440,7 +440,7 @@ def evaluate_model( if model_name in missing_tasks: tasks = missing_tasks[model_name] else: - print(f'Found no missing tasks for {model_name}') + print(f"Found no missing tasks for {model_name}") return evaluate_checkpoint( @@ -545,7 +545,6 @@ def get_results( force: bool, skip_on_fail: bool, ) -> None: - # compile tasks names into regex patterns (if possible) compiled_tasks = [re.compile(task) if re.escape(task) != task else task for task in tasks] @@ -625,7 +624,7 @@ def get_results( columns_filter_tasks=columns_filter_tasks, ) - if format == 'return_missing': + if format == "return_missing": return missing_tasks # okay we got all results! 
now time to sort them depending on the user's request diff --git a/src/cookbook/data/dataset.py b/src/cookbook/data/dataset.py index 8ba291d96..ff86f5016 100644 --- a/src/cookbook/data/dataset.py +++ b/src/cookbook/data/dataset.py @@ -43,10 +43,16 @@ def build(self) -> SourceMixtureDatasetConfig: schemes = {urlparse(path).scheme for path in paths + globs} if len(schemes) > 1: raise ValueError(f"All paths for source {source.name} must have the same scheme. Found: {schemes}") + elif len(schemes) == 0: + raise ValueError(f"No paths found for source {source.name}") scheme = schemes.pop() expanded = paths + expand_globs(self.cached_fs.get(scheme, self.cached_fs["s3"]), globs) + + if len(expanded) == 0: + raise ValueError(f"No paths found for source {source.name}") + source_configs.append( SourceMixtureConfig( source_name=source.name, diff --git a/src/cookbook/eval/datalake.py b/src/cookbook/eval/datalake.py index cf4c80a0e..b9f19d0d6 100644 --- a/src/cookbook/eval/datalake.py +++ b/src/cookbook/eval/datalake.py @@ -148,7 +148,6 @@ class FindExperiments(BaseDatalakeItem): @classmethod def run(cls, dashboard: str | None = None, model_name: str | None = None, limit: int = 10_000) -> list[Self]: - # make sure at least one of dashboard or model_name is provided assert dashboard or model_name, "Either dashboard or model_name must be provided" response = requests.get( @@ -239,9 +238,8 @@ def model_path(self) -> str | None: @property def model_name(self) -> str | None: model_name = self.model_config.get("model", None) - if 'revision' in self.model_config and \ - self.model_config['revision'] is not None: - model_name = model_name + ':' + self.model_config['revision'] + if "revision" in self.model_config and self.model_config["revision"] is not None: + model_name = model_name + ":" + self.model_config["revision"] return model_name @property @@ -304,7 +302,6 @@ def run(cls, model_name: str, dashboard: str, fuzzy: bool = False) -> List[Self] fns = [] for run in runs: - # if the 
experiment is in the cache, we remove it since we changed its tags cache.delete(experiment_id=run.experiment_id) diff --git a/src/cookbook/eval/evaluation.py b/src/cookbook/eval/evaluation.py index fbdef5110..88e91862d 100644 --- a/src/cookbook/eval/evaluation.py +++ b/src/cookbook/eval/evaluation.py @@ -1,13 +1,17 @@ import json +import os import re import shlex import subprocess +import sys +from collections.abc import Mapping from copy import deepcopy from hashlib import md5 -import sys from typing import Optional from urllib.parse import urlparse +from rich.pretty import pprint + from cookbook.cli.utils import ( PythonEnv, add_aws_flags, @@ -15,13 +19,9 @@ install_oe_eval, make_eval_run_name, ) -from cookbook.constants import ( - BEAKER_KNOWN_CLUSTERS, - FIM_TOKENS, - OE_EVAL_LAUNCH_COMMAND, - WEKA_MOUNTS, -) +from cookbook.constants import BEAKER_KNOWN_CLUSTERS, FIM_TOKENS, OE_EVAL_LAUNCH_COMMAND, WEKA_MOUNTS from cookbook.eval.named_tasks import NamedTasksGroupRegistry +from cookbook.eval.results import make_dashboard_table def evaluate_checkpoint( @@ -184,29 +184,23 @@ def evaluate_checkpoint( # these are all the tasks we want to run; note that we can't run regex patterns here, # they have to be actual strings - all_tasks_set = set() - for task_group in tasks: - try: - # this is a task group! the get function will return a class that has an expanded_tasks attribute - all_tasks_set.update(NamedTasksGroupRegistry.get(task_group).expanded_tasks) - except ValueError: - # actually not a task group, just a task name. append as is. - all_tasks_set.add(task_group) - - # we finish by sorting the tasks - all_tasks = sorted(all_tasks_set) - - # @davidh: we have a few specific tasks that are not implemented in oe-eval as standalone tasks - # @soldni: to clarify: this is fine, since these tasks are computed anyway as part of the non-bpb version, - # it's just the task alias that does not exist. 
- EXCLUDE_FROM_LAUNCH = [ - r'^mmlu_.*:bpb::olmes$', - r'^lambada:bpb$' - ] - all_tasks = [ - task for task in all_tasks - if not any(re.match(pattern, task) for pattern in EXCLUDE_FROM_LAUNCH) - ] + all_tasks = sorted( + list( + set( + task + for task_group in tasks + for task in NamedTasksGroupRegistry.get(task_group).expanded_tasks + if isinstance(task, str) + ) + ) + ) + + print("Launching evals on the following tasks:") + pprint(all_tasks) + + # @davidh we have a few specific tasks that are not implemented in oe-eval as standalone tasks + EXCLUDE_FROM_LAUNCH = [r"^mmlu_.*:bpb::olmes$", r"^lambada:bpb$"] + all_tasks = [task for task in all_tasks if not any(re.match(pattern, task) for pattern in EXCLUDE_FROM_LAUNCH)] # DOING SOME PRETTY PRINTING HERE # print( @@ -323,8 +317,11 @@ def evaluate_checkpoint( if "stop_sequences" in partition_task_args["generation_kwargs"]: # Add the stop tokens if they do not exist partition_task_args["generation_kwargs"]["stop_sequences"].extend( - [stop_tok for stop_tok in infilling_dict["generation_kwargs"]["stop_sequences"] - if stop_tok not in partition_task_args["generation_kwargs"]["stop_sequences"]] + [ + stop_tok + for stop_tok in infilling_dict["generation_kwargs"]["stop_sequences"] + if stop_tok not in partition_task_args["generation_kwargs"]["stop_sequences"] + ] ) else: partition_task_args["generation_kwargs"].update(infilling_dict["generation_kwargs"]) diff --git a/src/cookbook/model/builder.py b/src/cookbook/model/builder.py index d762c1c5d..bc08e50d9 100644 --- a/src/cookbook/model/builder.py +++ b/src/cookbook/model/builder.py @@ -16,19 +16,24 @@ NumpyDatasetType, TokenizerConfig, ) +from olmo_core.nn.attention import SlidingWindowAttentionConfig from olmo_core.data.types import NumpyDatasetDType from olmo_core.distributed.parallel import DataParallelType -from olmo_core.float8 import Float8Config +from olmo_core.float8 import AOFloat8LinearConfig, Float8Config from olmo_core.io import resource_path from 
olmo_core.nn.transformer import TransformerConfig from olmo_core.optim import ( - CosWithWarmup, OptimConfig, OptimGroupOverride, - Scheduler, SkipStepAdamWConfig, ) -from olmo_core.optim.scheduler import CosWithWarmupAndLinearDecay, LinearWithWarmup +from olmo_core.optim.scheduler import ( + WSD, + CosWithWarmup, + CosWithWarmupAndLinearDecay, + LinearWithWarmup, + Scheduler, +) from olmo_core.train import Duration, TrainerConfig from olmo_core.train.callbacks import ( BeakerCallback, @@ -66,7 +71,6 @@ WrappedTransformerConfig, ) from cookbook.model.evaluators import DownstreamEvaluator, get_tasks_for_groups -from cookbook.model.schedulers import WSD logger = logging.getLogger(__name__) @@ -195,6 +199,7 @@ class TransformerConfigBuilder: activation_checkpointing: bool annealing: Optional[AnnealConfig] = None profile: bool = False + shard_degree: Optional[int] = None def __init__( self, @@ -215,6 +220,7 @@ def __init__( lm_evaluator: bool, downstream_evaluators: List[DownstreamEvaluator], # type: ignore scheduler_type: SchedulerType, + shard_degree: Optional[int] = None, activation_checkpointing: bool = False, model_overrides: Optional[List[str]] = None, load_path_fs: Optional[Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem]] = None, @@ -229,6 +235,7 @@ def __init__( seed: int = 42, warmup_steps: Optional[int] = None, profile: bool = False, + float8_enabled: bool = True, ): self.run_name = run_name self.sources = sources @@ -266,6 +273,9 @@ def __init__( self.checkpoint_dir = f"{self.data_dir}/checkpoints/{self.beaker_user.lower()}/{self.run_name}" self.eval_interval = eval_interval self.cluster = cluster + self.float8_enabled = float8_enabled + self.cancel_check_interval = 50 + self.shard_degree = shard_degree if any(substring in cluster for substring in ["augusta"]): self.root_dir = "gs://ai2-llm" @@ -338,8 +348,8 @@ def build_callbacks(self) -> Dict[str, Callback]: callbacks = { "checkpointer": CheckpointerCallback( save_interval=self.save_interval, - 
ephemeral_save_interval=100, - save_async=True, + ephemeral_save_interval=None, + save_async=False, ), "config_saver": ConfigSaverCallback(), "profiler": ProfilerCallback(enabled=self.profile), @@ -368,7 +378,7 @@ def build_callbacks(self) -> Dict[str, Callback]: project=self.metrics_config.project.strip(), entity=self.metrics_config.entity.strip(), group=self.group_id.strip(), - cancel_check_interval=10, + cancel_check_interval=self.cancel_check_interval, enabled=True, ) if MetricBackend.comet in self.metrics_config.backends: @@ -385,7 +395,7 @@ def build_callbacks(self) -> Dict[str, Callback]: workspace=self.metrics_config.workspace.strip(), project=self.metrics_config.project.strip(), enabled=True, - cancel_check_interval=10, + cancel_check_interval=self.cancel_check_interval, ) if self.lm_evaluator: @@ -412,7 +422,7 @@ def build_callbacks(self) -> Dict[str, Callback]: return callbacks - def build_dataset_config(self, loader_processes: int = 16) -> NumpyDatasetConfig: + def build_dataset_config(self, loader_processes: int = 8) -> NumpyDatasetConfig: is_fractional = any(source.ratio is not None and source.ratio != 1 for source in self.sources) mixture_config = None @@ -475,14 +485,14 @@ def get_optimizer_config(self) -> OptimConfig: lr=lr, weight_decay=0.033, betas=(0.9, 0.95), + foreach=True, group_overrides=[OptimGroupOverride(params=["embeddings.weight"], opts=dict(weight_decay=0.0))], ) def get_ac_config(self): - # NOTE: This is pretty broad, we can make this more fine-grained if we find it useful return TransformerActivationCheckpointingConfig( mode=TransformerActivationCheckpointingMode.selected_modules, - modules=["blocks.*.feed_forward"], + modules=[f"blocks.{i}.feed_forward" for i in range(0, 64, 4)], ) def load_state_and_config_from_path(self) -> Tuple[Path, Path]: @@ -511,6 +521,16 @@ def load_state_and_config_from_path(self) -> Tuple[Path, Path]: resource_path(folder=self.load_path, fname="config.json"), ) + def get_fp8_config(self) -> Float8Config: + 
return Float8Config( + enabled=self.float8_enabled, + ao=AOFloat8LinearConfig( + enable_fsdp_float8_all_gather=True, + force_recompute_fp8_weight_in_bwd=True, + round_scales_to_power_of_2=True, + ), + ) + def get_state_from_checkpoint(self) -> SchedulerState: state_path, config_path = self.load_state_and_config_from_path() train_state = torch.load(state_path, weights_only=False) @@ -539,30 +559,47 @@ def get_state_from_checkpoint(self) -> SchedulerState: try: # Try olmo_core v2 config format first - base_lr: int = config["optim"]["lr"] + base_lr: int = config["train_module"]["optim"]["lr"] scheduler_config = config["train_module"]["scheduler"] - except KeyError as e: + except KeyError: # Now try olmo_core v1 config format try: base_lr: int = config["optim"]["lr"] scheduler_config = config["trainer"]["callbacks"]["lr_scheduler"]["scheduler"] - except KeyError as e: + except Exception as e: logger.error( - "Could not find base_lr or scheduler config in train state. Please ensure the checkpoint is valid. Unable to load scheduler state." + "Could not find base_lr or scheduler config in train state. Please ensure the checkpoint is valid. Unable to load scheduler state.", + e, ) raise e scheduler_class = scheduler_config.pop("_CLASS_").split(".")[-1] try: - assert scheduler_class == CosWithWarmup.__name__ + assert scheduler_class == CosWithWarmup.__name__ or scheduler_class == WSD.__name__ except AssertionError as e: logger.error( - f"Expected scheduler class {CosWithWarmup.__name__}, but got {scheduler_class}: Anneals from a base LR can only be inferred from CosWithWarmup scheduler." + f"Expected scheduler class {CosWithWarmup.__name__} or {WSD.__name__}, but got {scheduler_class}: Anneals from a base LR cannot be inferred from this scheduler type. Exiting!" 
) raise e - scheduler = CosWithWarmup(**scheduler_config) + try: + if scheduler_class == WSD.__name__: + if not scheduler_config.get("decay_fraction", None): + scheduler_config["decay_fraction"] = None + scheduler = WSD(**scheduler_config) + elif scheduler_class == CosWithWarmup.__name__: + scheduler = CosWithWarmup(**scheduler_config) + else: + raise ValueError(f"Unsupported scheduler class: {scheduler_class}") + except Exception as e: + logger.error( + "Could not instantiate scheduler from config. Please ensure the checkpoint is valid. Unable to load scheduler state.", + e, + ) + logger.info(scheduler_config) + raise e + starting_lr = float(scheduler.get_lr(base_lr, last_pretrain_step, max_pretrain_steps)) return SchedulerState( @@ -596,10 +633,14 @@ def build(self) -> ModelTrainConfig: optim=self.get_optimizer_config(), compile_model=True, dp_config=train_module.TransformerDataParallelConfig( - name=DataParallelType.hsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32 + name=DataParallelType.hsdp, + param_dtype=DType.bfloat16, + reduce_dtype=DType.float32, + wrapping_strategy=train_module.TransformerDataParallelWrappingStrategy.blocks, + shard_degree=self.shard_degree, ), ac_config=self.get_ac_config() if self.activation_checkpointing else None, - float8_config=Float8Config(enabled=False), + float8_config=self.get_fp8_config(), z_loss_multiplier=1e-5, max_grad_norm=1.0, scheduler=self.get_scheduler_config(), @@ -613,7 +654,7 @@ def build(self) -> ModelTrainConfig: work_dir=self.dataset_cache, save_overwrite=True, metrics_collect_interval=10, - cancel_check_interval=5, + cancel_check_interval=self.cancel_check_interval, max_duration=Duration.tokens(self.max_tokens), ) @@ -627,6 +668,16 @@ def build(self) -> ModelTrainConfig: self.transformer_config = self.transformer_config.merge(dotlist=self.model_overrides) + # TODO(undfined): The hax once swafix is not an issue anymore + if self.model_identifier == "olmo2_7B_swafix": + 
self.transformer_config.block.attention.sliding_window = SlidingWindowAttentionConfig( + force_full_attention_on_first_layer=False, + force_full_attention_on_last_layer=True, + pattern=[4096, 4096, 4096, -1], + ) + self.transformer_config.block.attention.use_flash = True + self.transformer_config.block.attention.use_head_qk_norm = True + return ModelTrainConfig( init_seed=self.seed, model=self.transformer_config, diff --git a/src/cookbook/model/config.py b/src/cookbook/model/config.py index 373c04c75..457ab2841 100644 --- a/src/cookbook/model/config.py +++ b/src/cookbook/model/config.py @@ -1,6 +1,5 @@ from dataclasses import dataclass from enum import Enum -from typing import Any, Optional import olmo_core.train.train_module as train_module from olmo_core.config import Config @@ -187,6 +186,18 @@ def olmo_30m(cls, tokenizer: TokenizerConfig) -> TransformerConfig: block_name=DefaultTransformerProperties.block_type, ) + @classmethod + def olmo2_7B_swafix(cls, tokenizer: TokenizerConfig) -> TransformerConfig: + """ + OLMo2 7B with SWA fix changes + """ + return getattr(TransformerConfig, "olmo2_7B")( + vocab_size=tokenizer.padded_vocab_size(), + n_kv_heads=8, + hidden_size_multiplier=1.2, + hidden_size_multiple_of=1024, + ) + @classmethod def from_model_identifier( cls, diff --git a/src/cookbook/model/schedulers.py b/src/cookbook/model/schedulers.py deleted file mode 100644 index 09d03bcd2..000000000 --- a/src/cookbook/model/schedulers.py +++ /dev/null @@ -1,71 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, Union - -import torch -from olmo_core.exceptions import OLMoConfigurationError -from olmo_core.optim import Scheduler - - -@dataclass -# NOTE: Temporary port from https://github.com/allenai/OLMo-core/blob/dirkg/DenseExperiments/src/olmo_core/optim/scheduler.py#L67 for debugging -class WSD(Scheduler): - """ - Warmup-stable-decay scheduler - """ - - warmup_steps: Optional[int] = 2000 - warmup_fraction: Optional[float] = None - decay_steps: 
Optional[int] = None - decay_fraction: Optional[float] = 0.1 - warmup_min_lr: float = 0.0 - decay_min_lr: float = 0.0 - - def __post_init__(self): - if (self.warmup_fraction is None) == (self.warmup_steps is None): - raise OLMoConfigurationError("Either warmup_fraction or warmup_steps must be specified.") - if self.warmup_fraction is not None and (self.warmup_fraction < 0 or self.warmup_fraction > 1): - raise OLMoConfigurationError("warmup_fraction must be between 0 and 1.") - - if (self.decay_fraction is None) == (self.decay_steps is None): - raise OLMoConfigurationError("Either decay_fraction or decay_steps must be specified.") - if self.decay_fraction is not None and (self.decay_fraction < 0 or self.decay_fraction > 1): - raise OLMoConfigurationError("decay_fraction must be between 0 and 1.") - - def get_lr( - self, initial_lr: Union[float, torch.Tensor], step: int, max_steps: int - ) -> Union[float, torch.Tensor]: - if self.warmup_steps is None: - warmup_steps = round(max_steps * self.warmup_fraction) if self.warmup_fraction is not None else 0 - else: - warmup_steps = self.warmup_steps - - if step <= warmup_steps: - return _linear_warmup(initial_lr, step, warmup_steps, self.warmup_min_lr) - - if self.decay_steps is None: - decay_steps = round(max_steps * self.decay_fraction) if self.decay_fraction is not None else 0 - else: - decay_steps = self.decay_steps - - if step >= max_steps - decay_steps: - return _linear_decay(initial_lr, max_steps - step, decay_steps, self.decay_min_lr) - - del step, max_steps - return initial_lr - - -def _linear_warmup( - initial_lr: Union[float, torch.Tensor], step: int, warmup_steps: int, warmup_min_lr: float = 0.0 -) -> Union[float, torch.Tensor]: - if isinstance(initial_lr, float): # not worth the potential host-device sync if it's a tensor - assert 0 <= warmup_min_lr < initial_lr - return warmup_min_lr + (initial_lr - warmup_min_lr) * min(step, warmup_steps) / warmup_steps - - -def _linear_decay( - initial_lr: Union[float, 
torch.Tensor], step_from_end: int, decay_steps: int, decay_min_lr: float = 0.0 -) -> Union[float, torch.Tensor]: - if isinstance(initial_lr, float): # not worth the potential host-device sync if it's a tensor - assert 0 <= decay_min_lr < initial_lr - - return decay_min_lr + (initial_lr - decay_min_lr) * min(step_from_end, decay_steps) / decay_steps diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo25_7b-with-reasoning-10B-anneal-2T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo25_7b-with-reasoning-10B-anneal-2T.yaml new file mode 100644 index 000000000..7948ae884 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo25_7b-with-reasoning-10B-anneal-2T.yaml @@ -0,0 +1,68 @@ +name: "anneal-round1-10B-olmo25_7b_with-reasoning-anneal-2T" +description: "OLMo2.5 7b anneal to 10B Tokens on Round 1 midtraining mix WITH reasoning data from 2T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo25_7b" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00020712352850360292 +load_path: gs://ai2-llm/checkpoints/OLMo25/step476838 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: 
dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo3_7b-with-reasoning-10B-anneal-4T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo3_7b-with-reasoning-10B-anneal-4T.yaml new file mode 100644 index 000000000..4b44e2d6b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round1-olmo3_7b-with-reasoning-10B-anneal-4T.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-10B-olmo3_7b_with-reasoning-anneal-4T" +description: "OLMo3 7b anneal to 10B Tokens on Round 1 midtraining mix WITH reasoning data from 4T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 
10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo25_7b-10B-anneal-2T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo25_7b-10B-anneal-2T.yaml new file mode 100644 index 000000000..3eedcacc3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo25_7b-10B-anneal-2T.yaml @@ -0,0 +1,86 @@ +name: "anneal-round3-10B-olmo25_7b-anneal-2T" +description: "OLMo2.5 7b anneal to 10B Tokens on Round 3 midtraining mix from 2T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo25_7b" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00020712352850360292 +load_path: gs://ai2-llm/checkpoints/OLMo25/step476838 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B 
+ - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo3_7b-10B-anneal-4T.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo3_7b-10B-anneal-4T.yaml new file mode 100644 index 000000000..35fe83abd --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round3-webround2-olmo3_7b-10B-anneal-4T.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-10B-olmo3_7b-anneal-4T" +description: "OLMo3 7b anneal to 10B Tokens on Round 3 midtraining mix from 4T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 
240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo25_7b-10B-anneal-decon-2T.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo25_7b-10B-anneal-decon-2T.yaml new file mode 100644 index 000000000..a0b048432 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo25_7b-10B-anneal-decon-2T.yaml @@ -0,0 +1,355 @@ +name: "anneal-round5-10B-olmo25_7b-anneal-2T" +description: "OLMo2.5 7b anneal to 
10B Tokens on Round 5 midtraining mix from 2T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo25_7b" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00020712352850360292 +load_path: gs://ai2-llm/checkpoints/OLMo25/step476838 +load_state: false +dataset: + sources: + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: code_fim + target_ratio: 0.1 + paths: + # 21,390,279,634 + - s3://ai2-llm/preprocessed/stack-edu/sample-fim-weighted-pl-edu-score-decon/**/**/*.npy + - name: swallowcode + target_ratio: 0.1 + paths: + # 18,833,636,683 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data-decon/allenai/dolma2-tokenizer/*.npy + - name: megamatt + # 20% less the ratio for dolminos2math + target_ratio: 0.01698 + paths: + # 3,883,674,937 + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10,687,987,907 + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,848,999 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,380 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + # 9,860,465,314 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 21,390,279,634 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + # 4,215,210,848 + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: .05 + paths: + # 486,558,362,887 + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.0 + # paths: + # # 1,627,593,038 + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.011 + paths: + # 1,639,399,859 + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: .05 + paths: + # 17,055,708,123 + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2,483,453,165 + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4,774,150,082 + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254,415,258 + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .0125 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .0125 + paths: + # 5,601,836,260 + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + # ORIGINALS + # 1,198,073,462 --> actually code meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 366,757,554 --> has special tokens + # - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 1,198,074,765 --> different version of code meta reasoning + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #### + # FIXES + # 1,049,524,455 --> actual math meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-fixed-decon/allenai/dolma2-tokenizer/*.npy + # 364,483,656 --> verifiable/gpt-41 without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/cleaned-documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 73,752,560 --> verifiable/gpt-o4-mini without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/o4-mini-high-decon/allenai/dolma2-tokenizer/*.npy + #### + # NEW FORMAT + # 1,057,302,754 --> actual math 
meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + # 1,199,242,351 --> actual code meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + + # PDF 0.05 + - name: pdf-quality-art_design + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.000775 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.00258 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00011 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0007 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.00024 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + target_ratio: 0.005935 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.00027 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.00269 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + target_ratio: 0.00045 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.00092 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.00026 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + target_ratio: 0.02976 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + target_ratio: 0.00059 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.00435 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.00019 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + + # Web 0.225 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0002556935 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.0028240385 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.0065884955 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.0096539490 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.0077935890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.0216076590 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 0.0000286700 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.0091422550 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.0031632340 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.0154939065 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.0224049655 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.0061185640 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.0020584890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.0035336085 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + target_ratio: 0.0153408140 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: 
web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.0048655335 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.0044465105 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.0473059895 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.0010352740 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.0106495710 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.0250799635 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.0029238175 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.0021929705 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0004924385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo3_7b-10B-anneal-decon-4T.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo3_7b-10B-anneal-decon-4T.yaml new file mode 100644 index 000000000..d4fbc9f75 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/anneal-round5-olmo3_7b-10B-anneal-decon-4T.yaml @@ -0,0 +1,354 @@ +name: "anneal-round5-10B-olmo3_7b-anneal-4T" +description: "OLMo3 7b anneal to 10B Tokens on Round 5 midtraining mix from 4T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: code_fim + target_ratio: 0.1 + paths: + # 21,390,279,634 + - s3://ai2-llm/preprocessed/stack-edu/sample-fim-weighted-pl-edu-score-decon/**/**/*.npy + - name: swallowcode + target_ratio: 0.1 + paths: + # 18,833,636,683 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data-decon/allenai/dolma2-tokenizer/*.npy + - name: megamatt + # 20% less the ratio for dolminos2math + target_ratio: 0.01698 + paths: + # 3,883,674,937 + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10,687,987,907 + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,848,999 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,380 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + # 9,860,465,314 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 21,390,279,634 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + # 4,215,210,848 + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: .05 + paths: + # 486,558,362,887 + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.0 + # paths: + # # 1,627,593,038 + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.011 + paths: + # 1,639,399,859 + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: .05 + paths: + # 17,055,708,123 + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2,483,453,165 + - 
s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4,774,150,082 + - s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254,415,258 + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .0125 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .0125 + paths: + # 5,601,836,260 + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + # ORIGINALS + # 1,198,073,462 --> actually code meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 366,757,554 --> has special tokens + # - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 1,198,074,765 --> different version of code meta reasoning + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #### + # FIXES + # 1,049,524,455 --> actual math meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-fixed-decon/allenai/dolma2-tokenizer/*.npy + # 364,483,656 --> verifiable/gpt-41 without special tokens + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/cleaned-documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 73,752,560 --> verifiable/gpt-o4-mini without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/o4-mini-high-decon/allenai/dolma2-tokenizer/*.npy + #### + # NEW FORMAT + # 1,057,302,754 --> actual math meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + # 1,199,242,351 --> actual code meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + + # PDF 0.05 + - name: pdf-quality-art_design + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.000775 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.00258 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0007 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.00024 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + 
target_ratio: 0.005935 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.00027 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.00269 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + 
target_ratio: 0.00045 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.00092 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.00026 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + target_ratio: 0.02976 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + 
target_ratio: 0.00059 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.00435 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.00019 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + + # 
Web 0.225 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0002556935 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.0028240385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.0065884955 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.0096539490 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.0077935890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.0216076590 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 0.0000286700 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.0091422550 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.0031632340 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.0154939065 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.0224049655 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.0061185640 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.0020584890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.0035336085 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + 
target_ratio: 0.0153408140 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.0048655335 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.0044465105 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.0473059895 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.0010352740 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - 
name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.0106495710 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.0250799635 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.0029238175 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.0021929705 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0004924385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/fae-olmo3_7b-llamma-nemotron-reasonig-5b-sub8k.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/fae-olmo3_7b-llamma-nemotron-reasonig-5b-sub8k.yaml new file mode 100644 index 000000000..53af198e0 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/fae-olmo3_7b-llamma-nemotron-reasonig-5b-sub8k.yaml @@ -0,0 +1,41 @@ +name: "math-code-with-llamma-nemotron-reasoning-olmo3-microanneal-5b-sub8k" +description: "OLMo3 7b 5B web, math and code with llama nemotron microanneal -- under 8k length" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.4 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.3 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: bigreasoningtraces + target_ratio: 0.1 + paths: + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-sub8k-flat/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git
a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-anneal-round5-gen-mc-ratios-10B-olmo3_7b-12T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-anneal-round5-gen-mc-ratios-10B-olmo3_7b-12T-microanneal.yaml new file mode 100644 index 000000000..b2d99f7fb --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-anneal-round5-gen-mc-ratios-10B-olmo3_7b-12T-microanneal.yaml @@ -0,0 +1,260 @@ +name: "mixing-gen-mc-round5-ratios-10B-olmo3-microanneal" +description: "OLMo3 7b 10B mixing gen/MC validation with round 5 ratios" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: sponge + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.09523809523809523 + - name: reddit + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.11238095238095237 + - name: rcqa + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.057142857142857134 + - name: nemotron-synth-qa + paths: + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.09523809523809523 + - name: 
instruction-new-format + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.02095238095238095 + - name: flan + paths: + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.09523809523809523 + - name: pdf-quality-art_design + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 6.666666666666666e-05 + - name: pdf-quality-crime_law + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.001476190476190476 + - name: pdf-quality-education_jobs + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.004914285714285714 + 
- name: pdf-quality-entertainment + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0002095238095238095 + - name: pdf-quality-finance_business + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0013333333333333333 + - name: pdf-quality-hardware + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00045714285714285713 + - name: pdf-quality-health + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.011304761904761904 + - name: pdf-quality-history + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0005142857142857143 + - name: pdf-quality-home_hobbies + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 6.666666666666666e-05 + - name: pdf-quality-industrial + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0051238095238095235 + - name: pdf-quality-literature + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0008571428571428571 + - name: pdf-quality-politics + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0017523809523809523 + - name: pdf-quality-religion + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0004952380952380951 + - name: pdf-quality-science_tech + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + target_ratio: 
0.056685714285714284 + - name: pdf-quality-software + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0011238095238095239 + - name: pdf-quality-software_dev + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.008285714285714285 + - name: pdf-quality-sports_fitness + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0003619047619047619 + - name: pdf-quality-transportation + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0002095238095238095 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00048703523809523805 + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005379120952380952 + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.012549515238095239 + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.018388474285714284 + - name: 
web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.014844931428571427 + - name: web-all_dressed-snazzy2-v18-v20--entertainment + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.041157445714285715 + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + target_ratio: 5.460952380952381e-05 + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.017413819047619048 + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.006025207619047619 + - name: web-all_dressed-snazzy2-v18-v20--games + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.029512202857142854 + - name: web-all_dressed-snazzy2-v18-v20--health + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04267612476190476 + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.011654407619047619 + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.003920931428571428 + - name: web-all_dressed-snazzy2-v18-v20--industrial + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.006730682857142857 + - name: web-all_dressed-snazzy2-v18-v20--literature + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.029220598095238092 + - name: web-all_dressed-snazzy2-v18-v20--politics + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.009267682857142856 + - name: web-all_dressed-snazzy2-v18-v20--religion + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.008469543809523809 + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + 
target_ratio: 0.09010664666666666 + - name: web-all_dressed-snazzy2-v18-v20--social_life + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0019719504761904765 + - name: web-all_dressed-snazzy2-v18-v20--software_development + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.020284897142857142 + - name: web-all_dressed-snazzy2-v18-v20--software + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04777135904761905 + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005569176190476191 + - name: web-all_dressed-snazzy2-v18-v20--transportation + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.004177086666666666 + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0009379780952380953 \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-nat-dist-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-nat-dist-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml new file mode 100644 index 000000000..1edca9daf --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-nat-dist-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml @@ -0,0 +1,276 @@ +name: "mixing-gen-mc-nat-dist-ratios-10B-olmo3-microanneal" +description: "OLMo3 7b 10B mixing gen/MC validation with natural distribution ratios" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: sponge + # 5,220,997,516 + target_ratio: 0.002492064670205875 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/*/allenai/dolma2-tokenizer/*.npy + - name: reddit-high + # 
9,860,465,314 + target_ratio: 0.004706556010705806 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit-low + # 11,789,114,475 + target_ratio: 0.005627130751571149 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + # 4,215,210,848 + target_ratio: 0.002011986789799757 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + # 486,558,362,887 + target_ratio: 0.2322420001978607 + paths: + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + # 1,639,399,859 + target_ratio: 0.0007825114753328669 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + # 17,055,708,123 + target_ratio: 0.008140959176558946 + paths: + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + + #PDF-Web 0.7439967909279649 + # 95,853,782,702 + # 144,337,465,243 + # 737,026,615,245 + # 581,491,845,987 + - name: pdf-quality-art_design + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 7.891913084101734e-05 + - name: pdf-quality-crime_law + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.001747495040051098 + - name: pdf-quality-education_jobs + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005817467359137849 + - name: pdf-quality-entertainment + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00024803155407176884 + - name: pdf-quality-finance_business + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0015783826168203467 + - name: pdf-quality-hardware + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0005411597543384047 + - name: pdf-quality-health + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.013382429758326798 + - name: pdf-quality-history + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0006088047236307052 + - name: pdf-quality-home_hobbies + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 7.891913084101734e-05 + - name: pdf-quality-industrial + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.006065498913209619 + - name: pdf-quality-literature + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0010146745393845088 + - name: pdf-quality-politics + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 
0.0020744457249638848 + - name: pdf-quality-religion + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0005862564005332716 + - name: pdf-quality-science_tech + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.06710380953796218 + - name: pdf-quality-software + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0013303510627485783 + - name: pdf-quality-software_dev + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.009808520547383584 + - name: pdf-quality-sports_fitness + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00042841813885123695 + - name: pdf-quality-transportation + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00024803155407176884 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0007173682024900249 + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.007923061878802653 + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.018484541742158554 + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.02708491236992874 + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.021865526233072138 + - name: web-all_dressed-snazzy2-v18-v20--entertainment + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.06062198490320407 + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + target_ratio: 8.043593742269167e-05 + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.025649314652329617 + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.00887470150252287 + - name: web-all_dressed-snazzy2-v18-v20--games + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04346937194513554 + - name: web-all_dressed-snazzy2-v18-v20--health + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + target_ratio: 
0.06285889092834201 + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.01716611199932801 + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.005775252612113678 + - name: web-all_dressed-snazzy2-v18-v20--industrial + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.009913816260282225 + - name: web-all_dressed-snazzy2-v18-v20--literature + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.04303985890886475 + - name: web-all_dressed-snazzy2-v18-v20--politics + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.013650636488803977 + - name: web-all_dressed-snazzy2-v18-v20--religion + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.012475034542286063 + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.13272067007814822 + - name: web-all_dressed-snazzy2-v18-v20--social_life + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0029045425420069658 + - name: web-all_dressed-snazzy2-v18-v20--software_development + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.029878208110725914 + - name: web-all_dressed-snazzy2-v18-v20--software + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0703638079752142 + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.008202999702314992 + - name: web-all_dressed-snazzy2-v18-v20--transportation + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0061525510257345265 + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + target_ratio: 0.0013815748995648469 \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-proposed-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-proposed-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml new file mode 100644 index 000000000..dc391a0bd --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/mixing-proposed-gen-mc-ratios-10B-olmo3_7B_12T-microanneal.yaml @@ -0,0 +1,263 @@ +name: "mixing-gen-mc-proposed-ratios-10B-olmo3-microanneal" +description: "OLMo3 7b 10B mixing gen/MC validation with proposed ratios" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: pdf-quality-art_design + target_ratio: 7.016231376725744e-05 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.0015535940905607002 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.005171964843414975 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00022051012898280912 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0014032462753451485 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.0004811130086897653 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + target_ratio: 0.011897523777390655 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.0005412521347759859 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 7.016231376725744e-05 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.005392474972397786 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + target_ratio: 0.00090208689129331 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.001844266533310767 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.0005212057594139124 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + 
target_ratio: 0.0596580130775309 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + target_ratio: 0.0011827361463623398 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.008720173282501995 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.0003808811318793975 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00022051012898280912 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0006377694783683423 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.007043923920776696 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.01643350862758409 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.024079587544949644 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.01943934120791984 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.05389540762354546 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 7.151081644555054e-05 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.022803282846299847 + 
paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.007889970210963536 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.03864603780069838 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.05588410796440155 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.015261371018986869 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - 
name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.005134434218143875 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.008813785449387414 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + target_ratio: 0.0382641832605278 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.012135970457906422 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.011090812563261708 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.1179940680819547 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.00258225633013083 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.02656294094889634 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.06255628414055135 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.007292800019629874 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.005469867837321492 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0012282762184939744 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.062108549485272356 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: reddit-high + target_ratio: 0.11904721336433728 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit-low + target_ratio: 0.0009788837065593818 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.0012504569520369563 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: 0.06109484203464316 + paths: + - 
s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.023028775198057915 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: 0.07104790386157962 + paths: + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/testrun-webv18-redditv1-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/testrun-webv18-redditv1-10B-microanneal.yaml new file mode 100644 index 000000000..646d97ea6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/testrun-webv18-redditv1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv19-redditv1-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit v1 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy \ No 
newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-finemath-stackedufim-omrfullthoughts-5B-olmo3_7B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-finemath-stackedufim-omrfullthoughts-5B-olmo3_7B-microanneal.yaml new file mode 100644 index 000000000..b68c3589a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-finemath-stackedufim-omrfullthoughts-5B-olmo3_7B-microanneal.yaml @@ -0,0 +1,41 @@ +name: "math-code-with-omrfullthoughts-olmo3-microanneal-5b" +description: "OLMo3 7b 5B web, math and code with OMR fulthoughts microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.4 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + target_ratio: 0.3 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: omrfullthoughts + target_ratio: 0.1 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-mathmeta-codemeta-verifiable-5B-olmo3_7B-microanneal.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-mathmeta-codemeta-verifiable-5B-olmo3_7B-microanneal.yaml new file mode 100644 index 000000000..8467bf152 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-mathmeta-codemeta-verifiable-5B-olmo3_7B-microanneal.yaml @@ -0,0 +1,39 @@ +name: "webv18-mathmeta-codemeta-verifiable-5B-olmo3-4T-microanneal" +description: "OLMo3 7b 5B web v18 + math meta + code meta + verifiable microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: 0.5 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-only-5B-olmo3_7B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-only-5B-olmo3_7B-microanneal.yaml new file mode 100644 index 000000000..43f379847 --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/web-only-5B-olmo3_7B-microanneal.yaml @@ -0,0 +1,29 @@ +name: "webv18-only-5B-olmo3-4T-microanneal" +description: "OLMo3 7b 5B web v18 only microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 1.0 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-lowthresh-NON_MC-rewrites-5B-olmo3_7B-4T_microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-lowthresh-NON_MC-rewrites-5B-olmo3_7B-4T_microanneal.yaml new file mode 100644 index 000000000..9d3b4ef1f --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-lowthresh-NON_MC-rewrites-5B-olmo3_7B-4T_microanneal.yaml @@ -0,0 +1,34 @@ +name: "webv18-reddit-lowthresh-nonMC-5B-olmo3-4T-microanneal" +description: "OLMo3 7b 5B web v18 + reddit lowthresh nonMC rewrites (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 
+load_state: false +dataset: + sources: + - name: web + target_ratio: 0.54 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: nonMC_reddit_lowthresh + target_ratio: 0.46 + paths: + #2.32B + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_rewrites_nonmc-tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-psgqav1-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-psgqav1-10B-microanneal.yaml new file mode 100644 index 000000000..cd87a5921 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-psgqav1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-psgqav1-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + passageQA v1 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.58 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: psgqa + target_ratio: 0.42 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-highthresh-diverseqa-10B-microanneal.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-highthresh-diverseqa-10B-microanneal.yaml new file mode 100644 index 000000000..ca0210813 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-highthresh-diverseqa-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-highthresh-diverseqa-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit highthresh with diversified QA formats microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_diverseqa/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-addcontextv1-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-addcontextv1-10B-microanneal.yaml new file mode 100644 index 000000000..bc96ba2ea --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-addcontextv1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-lowthresh-addcontextv1-10B-olmo3-4T-microanneal" +description: "OLMo3 7b 10B web v18 + reddit lowthresh add-context v1 (no choices) microanneal (4T)" +budget: "ai2/oe-base" +workspace: 
"ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_addcontext-v1_tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-mcplusfull-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-mcplusfull-10B-microanneal.yaml new file mode 100644 index 000000000..f89425f9d --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh-mcplusfull-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-lowthresh-mcplusfull-10B-olmo3-4T-microanneal" +description: "OLMo3 7b 10B web v18 + reddit lowthresh MC plus full answer format microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + 
target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_rewrites_mcplusfull_tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh663-10B-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh663-10B-microanneal.yaml new file mode 100644 index 000000000..2ab84f5ea --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-reddit-lowthresh663-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv18-reddit-lowthresh663-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit lowthresh 663 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/lowthresh_rewrites_663_tokenized/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1-10B-microanneal.yaml 
b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1-10B-microanneal.yaml new file mode 100644 index 000000000..646d97ea6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1-10B-microanneal.yaml @@ -0,0 +1,33 @@ +name: "webv19-redditv1-10B-olmo3-microanneal" +description: "OLMo3 7b 10B web v18 + reddit v1 microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1and2-10B-olmo3_7B-4T-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1and2-10B-olmo3_7B-4T-microanneal.yaml new file mode 100644 index 000000000..ce64c009d --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/ae_microanneals/webv18-redditv1and2-10B-olmo3_7B-4T-microanneal.yaml @@ -0,0 +1,34 @@ +name: "webv18-redditv1and2-10B-olmo3-4T-microanneal" +description: "OLMo3 7b 10B web v18 + reddit v1 (highthresh) and v2 (lowthresh) microanneal (4T)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 
2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-no-reasoning-anneal.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-no-reasoning-anneal.yaml new file mode 100644 index 000000000..c84eff439 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-no-reasoning-anneal.yaml @@ -0,0 +1,57 @@ +name: "anneal-round1-100B-olmo3_7b_no-reasoning-anneal" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITHOUT reasoning data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + # 8_923_780_609 tokens + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-12T.yaml new file mode 100644 index 000000000..4210cc2a2 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-12T.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data from 12T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 
+global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T-largebatchLR.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T-largebatchLR.yaml new file mode 100644 index 000000000..888f5c822 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T-largebatchLR.yaml @@ -0,0 +1,68 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal-7T-largebatchLR" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data from 7T ckpt with larger batch size and LR" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 16777216 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true + initial_lr: 0.00024890158 +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step467000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - 
s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T.yaml new file mode 100644 index 000000000..6c7c2ea17 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal-7T.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal-7T" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data from 7T ckpt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" 
+priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step467000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal.yaml new file mode 100644 index 000000000..ac728febe --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round1-olmo3_7b-with-reasoning-anneal.yaml @@ -0,0 +1,67 @@ +name: "anneal-round1-100B-olmo3_7b_with-reasoning-anneal" +description: "OLMo3 7b anneal to 100B Tokens on midtraining mix WITH reasoning data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stackedu-fim-20pct-natural/allenai/dolma2-tokenizer/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-12T.yaml new file mode 100644 index 000000000..f1d3dedb6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-12T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-100B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt -- round 2 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - 
s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-7T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-7T.yaml new file mode 100644 index 000000000..f2d882b1a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-7T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-100B-olmo3_7b_with-reasoning-anneal-7T" +description: "OLMo3 7b anneal to 100B Tokens from 7T ckpt -- round 2 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step467000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + 
# 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-8T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-8T.yaml new file mode 100644 index 000000000..96a779ddc --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-olmo3_7b-anneal-8T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-100B-olmo3_7b_with-reasoning-anneal-8T" +description: "OLMo3 7b anneal to 100B Tokens from 8T ckpt -- round 2 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction 
+ target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round2-webround1-olmo3_7b-anneal-8T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-webround1-olmo3_7b-anneal-8T.yaml new file mode 100644 index 000000000..6c7ea22da --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round2-webround1-olmo3_7b-anneal-8T.yaml @@ -0,0 +1,80 @@ +name: "anneal-round2-webround1-100B-olmo3_7b_with-reasoning-anneal-8T" +description: "OLMo3 7b anneal to 100B Tokens from 8T ckpt -- round 2 data mix with round 1 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: 
true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-200B.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-200B.yaml new file mode 100644 index 000000000..1f9f964c7 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-200B.yaml @@ -0,0 +1,98 @@ +name: "anneal-round3-webround2-200B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 200B Tokens from 12T ckpt -- round 3 data mix with round2 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 200_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.425 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0020/*.npy + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0020/*.npy 
+ - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0020/*.npy + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0020/*.npy + - name: sponge + target_ratio: 0.025 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - s3://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-all/allenai/dolma2-tokenizer/**/**/*.npy + - name: megamath + # 10% plus the .0366 under 10% for dolminos2math + target_ratio: 0.1366 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.0634 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.092 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.008 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: reasoning + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-noinstruct-noreasoning.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-noinstruct-noreasoning.yaml new file mode 100644 index 000000000..9f176b114 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T-noinstruct-noreasoning.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-webround2-100B-olmo3_7b_no-reasoning-no-instruct-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt -- round 3 
data mix with round2 web data, no instruction or reasoning data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.461 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0069 over 10% for dolminos2math + target_ratio: 0.0931 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1069 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + # - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # # 898,733,958 + # - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # # 240,590,634 + # - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.011 + # paths: + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + # - name: r1_reasoning + # target_ratio: 0.01875 + # paths: + # - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + # - name: qwq_reasoning + # target_ratio: 0.01875 + # paths: + # - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + # - name: gemini_reasoning + # target_ratio: 0.0025 + # paths: + # - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + # - name: faeze_reasoning + # target_ratio: .01 + # paths: + # #3.166B + # #1,144,531,442 + # - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + # #366,757,554 + # - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + # #1,198,181,281 + # - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T.yaml new file mode 100644 index 000000000..0c5f40496 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-12T.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-webround2-100B-olmo3_7b_with-reasoning-anneal-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt -- round 3 data mix with round2 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: 
"olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning 
+ target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-8T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-8T.yaml new file mode 100644 index 000000000..366110b96 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round3-webround2-olmo3_7b-anneal-8T.yaml @@ -0,0 +1,85 @@ +name: "anneal-round3-webround2-100B-olmo3_7b_with-reasoning-anneal-8T" +description: "OLMo3 7b anneal to 100B Tokens from 8T ckpt -- round 3 data mix with round2 web data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step527000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.40 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge/sponge_63_mixes/eli5_60%_decon_final/*.npy + - name: code + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: megamath + # 10% less the .0268 over 10% for dolminos2math + target_ratio: 0.0732 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1268 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - 
s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/math-meta-reasoning/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/verifiable/gpt-41/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/pretraining-data/sources/midtraining-reasoning/code-meta-reasoning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round4-olmo3_7b-anneal-decon-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round4-olmo3_7b-anneal-decon-12T.yaml new file mode 100644 index 000000000..957734dd3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round4-olmo3_7b-anneal-decon-12T.yaml @@ -0,0 +1,105 @@ +name: "anneal-round4-100B-olmo3_7b-anneal-decon-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt with decon -- round 4 data mix" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.35 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes_50B/allenai/dolma2-tokenizer/*.npy + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/*/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.2 + paths: + - 
s3://ai2-llm/preprocessed/stack-edu/fim/documents-decon-2/fim_50pct_psm_50pct/*/allenai/dolma2-tokenizer/*.npy + - name: megamatt # NOTE(review): every other recipe names this source "megamath" -- likely a typo, confirm intentional + # 20% less the ratio for dolminos2math + # 3_906_854_120 + target_ratio: 0.01698 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10.7B + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,859,688 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,634 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2.48B + - 
s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4.77B + - s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254M + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .025 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .025 + paths: + # 5.60B + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + #3.166B + #1,144,531,442 + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #366,757,554 + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #1,198,181,281 + - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/anneal-round5-olmo3_7b-anneal-decon-12T.yaml b/src/cookbook/recipes/olmo3-midtraining/anneal-round5-olmo3_7b-anneal-decon-12T.yaml new file mode 100644 index 000000000..5b998aa08 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/anneal-round5-olmo3_7b-anneal-decon-12T.yaml @@ -0,0 +1,354 @@ +name: "anneal-round5-100B-olmo3_7b-anneal-decon-12T" +description: "OLMo3 7b anneal to 100B Tokens from 12T ckpt with decon -- round 5 data mix" 
+budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step756906 +load_state: false +dataset: + sources: + - name: sponge + target_ratio: 0.05 + paths: + - s3://ai2-llm/preprocessed/sponge_63_mixes/eli5_60pct_filter-decon-2/**/allenai/dolma2-tokenizer/*.npy + - name: code_fim + target_ratio: 0.1 + paths: + # 21,390,279,634 + - s3://ai2-llm/preprocessed/stack-edu/sample-fim-weighted-pl-edu-score-decon/**/**/*.npy + - name: swallowcode + target_ratio: 0.1 + paths: + # 18,833,636,683 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data-decon/allenai/dolma2-tokenizer/*.npy + - name: megamatt + # 20% less the ratio for dolminos2math + target_ratio: 0.01698 + paths: + # 3,883,674,937 + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-2/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.18302 + paths: + # 5_624_449_531 + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy + # 10,687,987,907 + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math-decon-2/allenai/dolma2-tokenizer/*.npy + # 1.99B + # 850,848,999 + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/jsonls-decon-2/allenai/dolma2-tokenizer/*.npy + # 898,733,958 + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + # 240,590,380 + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-2/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.059 + paths: + # 9,860,465,314 + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_highthresh_microanneal_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 21,390,279,634 -- NOTE(review): identical to the code_fim stack-edu count above; likely copy-paste, verify actual token count + - s3://ai2-llm/preprocessed/reddit-rewrites/densesub_lowthresh_4omini_rewrite/documents-decon-2/allenai/dolma2-tokenizer/*.npy + - name: rcqa + target_ratio: 0.03 + paths: + # 4,215,210,848 + - s3://ai2-llm/preprocessed/wiki_psgqa_rewrites/psgqa_rewrites_v1-decon-2/allenai/dolma2-tokenizer/*.npy + - name: nemotron-synth-qa + target_ratio: .05 + paths: + # 486,558,362,887 + - s3://ai2-llm/preprocessed/Nemotron-CC/v0/quality=high/kind=synthetic/kind2=diverse_qa_pairs-decon/allenai/dolma2-tokenizer/*.npy + # - name: instruction + # target_ratio: 0.0 + # paths: + # # 1,627,593,038 + # - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw-decon-2/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/allenai/dolma2-tokenizer/*.npy + - name: instruction-new-format + target_ratio: 0.011 + paths: + # 1,639,399,859 + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/raw/tulu-3-midtrain-v0-data-simple-concat-with-therefore-template-decon/allenai/dolma2-tokenizer/*.npy + - name: flan + target_ratio: .05 + paths: + # 17,055,708,123 + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased-decon/allenai/dolma2-tokenizer/*.npy + - name: r1_reasoning + target_ratio: 0.01875 + paths: + # 2,483,453,165 + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces-reformatted-keyword-filter-datecutoff-chinese-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.01875 + paths: + # 4,774,150,082 + - 
s3://ai2-llm/preprocessed/thinking-data/qwq-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + # 254,415,258 + - s3://ai2-llm/preprocessed/thinking-data/gemini-redo-reformatted-ngram-no-special-tokens-decon-2/allenai/dolma2-tokenizer/*.npy + - name: llamanemotron_reasoning + target_ratio: .0125 + paths: + # 20.9B + - s3://ai2-llm/preprocessed/thinking-data/llama-nemotron-processed-chinese-filtered-ngram-filtered-with-token-counts-decon-2/allenai/dolma2-tokenizer/*.npy + - name: openthoughts2 + target_ratio: .0125 + paths: + # 5,601,836,260 + - s3://ai2-llm/preprocessed/thinking-data/openthoughts2-filtered-chinese-filtered-ngram-filtered-with-token-counts-stripped-decon-2/allenai/dolma2-tokenizer/*.npy + - name: faeze_reasoning + target_ratio: .01 + paths: + # ORIGINALS + # 1,198,073,462 --> actually code meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 366,757,554 --> has special tokens + # - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 1,198,074,765 --> different version of code meta reasoning + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning/documents-decon-2/allenai/dolma2-tokenizer/*.npy + #### + # FIXES + # 1,049,524,455 --> actual math meta reasoning + - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-fixed-decon/allenai/dolma2-tokenizer/*.npy + # 364,483,656 --> verifiable/gpt-41 without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/gpt-41/cleaned-documents-decon-2/allenai/dolma2-tokenizer/*.npy + # 73,752,560 --> verifiable/gpt-o4-mini without special tokens + - s3://ai2-llm/preprocessed/midtraining-reasoning/verifiable/o4-mini-high-decon/allenai/dolma2-tokenizer/*.npy + #### + # NEW FORMAT + # 1,057,302,754 --> actual math 
meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/math-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + # 1,199,242,351 --> actual code meta-reasoning with latex format + # - s3://ai2-llm/preprocessed/midtraining-reasoning/code-meta-reasoning-latex-delim-decon/allenai/dolma2-tokenizer/*.npy + + # PDF 0.05 + - name: pdf-quality-art_design + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/art_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/art_design/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-crime_law + target_ratio: 0.000775 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/crime_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/crime_law/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-education_jobs + target_ratio: 0.00258 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/education_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/education_jobs/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-entertainment + target_ratio: 0.00011 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/entertainment/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/entertainment/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-finance_business + target_ratio: 0.0007 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/finance_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/finance_business/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-hardware + target_ratio: 0.00024 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/hardware/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-health + target_ratio: 0.005935 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/health/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-history + target_ratio: 0.00027 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/history/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/history/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-home_hobbies + target_ratio: 0.000035 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/home_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/home_hobbies/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-industrial + target_ratio: 0.00269 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/industrial/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-literature + target_ratio: 0.00045 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/literature/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-politics + target_ratio: 0.00092 + paths: 
+ - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/politics/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-religion + target_ratio: 0.00026 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/religion/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-science_tech + target_ratio: 0.02976 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/science_tech/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/science_tech/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software + target_ratio: 0.00059 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-software_dev + target_ratio: 0.00435 + paths: + - 
s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/software_dev/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/software_dev/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-sports_fitness + target_ratio: 0.00019 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/sports_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/sports_fitness/allenai/dolma2-tokenizer/*.npy + - name: pdf-quality-transportation + target_ratio: 0.00011 + paths: + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e12/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/s2pdf_dedupe_minhash_v1_with_no_pii_basic_quality_datadelve_norefs_mdtables_v2_denylisted_reshard_length-buckets_compression-decon-2/length_2e13/transportation/allenai/dolma2-tokenizer/*.npy + + # Web 0.225 + - name: web-all_dressed-snazzy2-v18-v20--adult_content + target_ratio: 0.0002556935 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/adult_content/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--art_and_design + target_ratio: 0.0028240385 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/art_and_design/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--crime_and_law + target_ratio: 0.0065884955 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/crime_and_law/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--education_and_jobs + target_ratio: 0.0096539490 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/education_and_jobs/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--electronics_and_hardware + target_ratio: 0.0077935890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/electronics_and_hardware/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--entertainment + target_ratio: 0.0216076590 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/entertainment/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--fashion_and_beauty + target_ratio: 0.0000286700 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/fashion_and_beauty/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--finance_and_business + target_ratio: 0.0091422550 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/finance_and_business/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--food_and_dining + target_ratio: 0.0031632340 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/food_and_dining/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--games + target_ratio: 0.0154939065 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/games/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--health + target_ratio: 0.0224049655 + paths: + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/health/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--history_and_geography + target_ratio: 0.0061185640 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/history_and_geography/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--home_and_hobbies + target_ratio: 0.0020584890 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/home_and_hobbies/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--industrial + target_ratio: 0.0035336085 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/industrial/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--literature + target_ratio: 0.0153408140 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/literature/allenai/dolma2-tokenizer/*.npy + - name: 
web-all_dressed-snazzy2-v18-v20--politics + target_ratio: 0.0048655335 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/politics/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--religion + target_ratio: 0.0044465105 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/religion/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--science_math_and_technology + target_ratio: 0.0473059895 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/science_math_and_technology/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--social_life + target_ratio: 0.0010352740 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/social_life/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software_development + target_ratio: 0.0106495710 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software_development/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--software + target_ratio: 0.0250799635 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/software/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--sports_and_fitness + target_ratio: 0.0029238175 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/sports_and_fitness/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--transportation + target_ratio: 0.0021929705 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/transportation/allenai/dolma2-tokenizer/*.npy + - name: web-all_dressed-snazzy2-v18-v20--travel_and_tourism + target_ratio: 0.0004924385 + paths: + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0018_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/all_dressed_v3_weborganizer_ft_dclm_plus2_vigintiles/vigintile_0020_subset-decon-2/travel_and_tourism/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/dolminos-baseline-anneal-olmo3_7b-100B.yaml 
b/src/cookbook/recipes/olmo3-midtraining/dolminos-baseline-anneal-olmo3_7b-100B.yaml new file mode 100644 index 000000000..9327407b5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/dolminos-baseline-anneal-olmo3_7b-100B.yaml @@ -0,0 +1,1185 @@ +name: "baseline-dolminos-anneal-100B-olmo3_7b" +description: "Baseline dolmino mix OLMo3 7b anneal to 100B Tokens" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: dolmino + target_ratio: 0.5 + paths: + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: 
http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + #SOURCE: 
http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ 
(21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - 
s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + #SOURCE: 
http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + 
#SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/pes2o/allenai/dolma2-tokenizer/ (9.76BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - 
s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml index 38a3608e2..af77a48eb 100644 --- a/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml +++ b/src/cookbook/recipes/olmo3-midtraining/example-olmo2_7b-web-code-reasoning-microanneal.yaml @@ -11,7 +11,7 @@ sequence_length: 4096 seed: 1337 model: "olmo2_7B" tokenizer: "dolma2" -priority: urgent +priority: high eval_interval: 250 cluster: ai2/augusta-google-1 rank_microbatch_size: 8192 diff --git a/src/cookbook/recipes/olmo3-midtraining/example-olmo3_7b-microanneal.yaml b/src/cookbook/recipes/olmo3-midtraining/example-olmo3_7b-microanneal.yaml new file mode 100644 index 000000000..fa7c2d451 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/example-olmo3_7b-microanneal.yaml @@ -0,0 +1,33 @@ +name: "example-olmo3-microanneal" +description: "OLMo3 7b 10B example microanneal" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true 
+load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: code + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/**/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/lr-test-dolminos-baseline-anneal-olmo3_7b-100B.yaml b/src/cookbook/recipes/olmo3-midtraining/lr-test-dolminos-baseline-anneal-olmo3_7b-100B.yaml new file mode 100644 index 000000000..1eeb594c6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/lr-test-dolminos-baseline-anneal-olmo3_7b-100B.yaml @@ -0,0 +1,1185 @@ +name: "lr-test-baseline-dolminos-anneal-100B-olmo3_7b" +description: "LR test version of baseline dolmino mix OLMo3 7b anneal to 100B Tokens" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 16 +gpus: 8 +preemptible: true +max_tokens: 100_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix-from283000-200B/step288419 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: dolmino + target_ratio: 0.5 + paths: + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer/ (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ (21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/ (191.58MT) + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-02-00000.npy + - 
s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/part-10-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/dolma2-tokenizer (9.03MT) + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-19-00000.npy 
+ - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/part-08-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/ (1.08MT) + - 
s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/ (17.06MT) + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-82-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-44-00000.npy + - 
s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/part-60-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/ (1.23MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/ (1.51MT) + - s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/owm-filtered-math/metamath/ (84.22MT) + - s3://ai2-llm/preprocessed/owm-filtered-math/metamath/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/ (2.21MT) + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-5-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-0-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/part-1-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/ (782.58MT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/part-02-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/ (3.09BT) + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-40-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-61-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-69-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-45-00000.npy + - 
s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/part-28-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind/dolma2-tokenizer/ (3.06BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-43-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-52-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-10-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/part-82-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/ (1.26BT) + - 
s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/ (3.66BT) + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/ 
(21.80MT) + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy + - s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/ (19.74MT) + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-1-00000.npy + - s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/part-0-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/ (17.08BT) + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-09-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-72-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-84-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-03-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-68-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-36-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-21-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-56-00000.npy + - 
s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/part-11-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/pes2o/allenai/dolma2-tokenizer/ (9.76BT) + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + #SOURCE: http://olmo-data.org/preprocessed/tinyGSM/mind-2students/ (3.41BT) + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-91-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-41-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-54-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-20-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-34-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-62-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-90-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-83-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-32-00000.npy + - 
s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/part-04-00000.npy + \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_dolminos-math.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_dolminos-math.yaml new file mode 100644 index 000000000..efb2de5e5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_dolminos-math.yaml @@ -0,0 +1,34 @@ +name: "microanneal-dolminos_math_baseline" +description: "OLMo3_4T 10B-microanneal with flat dolminos math dataset as a baseline" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: 
linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math # 10.69B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_finemath.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_finemath.yaml new file mode 100644 index 000000000..321a42990 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_finemath.yaml @@ -0,0 +1,34 @@ +name: "microanneal-finemath" +description: "OLMo3_4T 10B-microanneal with finemath" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: fineMath # 34.06B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web-pro.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web-pro.yaml new file mode 100644 index 000000000..a2ebebaae --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web-pro.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj_megamath-web-pro +description: microanneal just for megamath-web-pro (subsampled) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/megamath-web-pro/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web.yaml new file mode 100644 index 000000000..8b6389e2a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_megamath-web.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj_megamath-web +description: microanneal just for megamath-web (subsampled) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear 
+warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/megamath-web_reshard-0.05/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math-bestof.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math-bestof.yaml new file mode 100644 index 000000000..1883186ba --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math-bestof.yaml @@ -0,0 +1,39 @@ +name: "microanneal-mjnewmath_bestof" +description: "OLMo3_4T 10B-microanneal with mj's new math data and the best of Scott's new stuff" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 24.7B tokens | 8.01B requested + target_ratio: 0.80102 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: tinyMATH_MIND # 1.139B tokens | 1.139B requested + target_ratio: 0.1139 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: fullthoughts # 0.85085B tokens | 0.85B requested + target_ratio: 0.08508 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math.yaml new file mode 100644 index 000000000..ad34104f8 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-new-math.yaml @@ -0,0 +1,39 @@ +name: "microanneal-mjnewmath" +description: "OLMo3_4T 10B-microanneal with mj's new math data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: tinyMATH_MIND # 1.139B tokens | 1.139B requested + target_ratio: 0.1139 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: fineMath # 34.06B tokens | 3.861B requested + target_ratio: 0.3861 + paths: + - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-swallowcode.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-swallowcode.yaml new file mode 100644 index 000000000..88d159430 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mj-swallowcode.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowcode +description: microanneal just for (10B tokens of) swallowcode +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/original_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mmwpm.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mmwpm.yaml new file mode 100644 index 000000000..520de4400 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_mmwpm.yaml @@ -0,0 +1,34 @@ +name: "microanneal-megamathwebpromax" +description: "OLMo3_4T 10B-microanneal with 5B tokens of megamathwebpromax" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high 
+cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 73.9B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_omr-rewrites.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_omr-rewrites.yaml new file mode 100644 index 000000000..a9dec5338 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_omr-rewrites.yaml @@ -0,0 +1,46 @@ +name: "microanneal-omr-rewrites" +description: "OLMo3_4T 10B-microanneal with a bunch of openmathreasoning rewrites" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: openmathreasoning # 2.966B tokens | 2.965B requested + target_ratio: 0.2965 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-dialogue-2students-error-correct/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-sleek/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason-noheader/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-reformulation/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-lecture/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-planning/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-clean-thoughts/allenai/dolma2-tokenizer/*.npy + - name: fineMath # 34.06B tokens | 2.034B requested + target_ratio: 0.2035 + paths: + - s3://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_web.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_web.yaml new file mode 100644 index 000000000..668b6e9e4 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-10B-micro_web.yaml @@ -0,0 +1,31 @@ +name: "microanneal-web_ad18" +description: "OLMo3_4T 10B-microanneal with just web, cc-all-dressed_18" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 
+gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 24.7B tokens | 10B requested + target_ratio: 1.0 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-25B-milli_mmwpm.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-25B-milli_mmwpm.yaml new file mode 100644 index 000000000..08dcc356f --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-25B-milli_mmwpm.yaml @@ -0,0 +1,34 @@ +name: "millianneal-megamathwebpromax" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of megamathwebpromax" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 73.9B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/allenai/dolma2-tokenizer/*.npy + diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-mjicro_mj-swallowmath.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-mjicro_mj-swallowmath.yaml new file mode 100644 index 000000000..938fca9a5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_anneals/olmo3_4T-mjicro_mj-swallowmath.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-swallowmath +description: microanneal just for (all of) swallowmath +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 7306856190 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/original_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_conv.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_conv.yaml new file mode 100644 index 000000000..56cf7e170 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_conv.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_4o_conv +description: nanonanneal just for kodkode version 4o_conv +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 442515950 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 
+rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_4o_conv_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qst.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qst.yaml new file mode 100644 index 000000000..3e226517c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qst.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_4o_qst +description: nanonanneal just for kodkode version 4o_qst +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 378311140 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_4o_qst_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qstp.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qstp.yaml new file mode 100644 index 
000000000..2b43c618b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_4o_qstp.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_4o_qstp +description: nanonanneal just for kodkode version 4o_qstp +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 438489702 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_4o_qstp_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_conv.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_conv.yaml new file mode 100644 index 000000000..92cd24884 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_conv.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_r1_conv +description: nanonanneal just for kodkode version r1_conv +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2409101190 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + 
sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_r1_conv_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qst.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qst.yaml new file mode 100644 index 000000000..f67d651bd --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qst.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_r1_qst +description: nanonanneal just for kodkode version r1_qst +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 377930418 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_r1_qst_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qstp.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qstp.yaml new file mode 100644 index 000000000..3f1af1281 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/kodkode_v1/kodkode_r1_qstp.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-kodkode_r1_qstp +description: nanonanneal just for kodkode version 
r1_qstp +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 365577152 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/kodkode/kodkode_v1_sft_r1_qstp_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-10B-megamatt_test_og.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-10B-megamatt_test_og.yaml new file mode 100644 index 000000000..86cb06e3a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-10B-megamatt_test_og.yaml @@ -0,0 +1,34 @@ +name: "microanneal-megamattwebpromax-megamathtest" +description: "OLMo3_4T 10B-microanneal with 5T tokens of megamath (original, pre-rewrite)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 73.9B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/megamath_web_pro_max/0731_scratch/og_tokens/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-5B-megamatt_test_rewrite.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-5B-megamatt_test_rewrite.yaml new file mode 100644 index 000000000..e482d5806 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-5B-megamatt_test_rewrite.yaml @@ -0,0 +1,34 @@ +name: "microanneal-megamattwebpromax-megamathtest_rewrites" +description: "OLMo3_4T 5B-microanneal with ~2.5T tokens of megamatt, rewritten" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 5_373_701_500 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | ~2.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamathwebpromax # 2.68B tokens | 2.68B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/megamath_web_pro_max/0731_scratch/rewrite_tokens/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-mjicro-megamatt.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-mjicro-megamatt.yaml 
new file mode 100644 index 000000000..ec5cb6408 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/megamatt/olmo3_4T-mjicro-megamatt.yaml @@ -0,0 +1,34 @@ +name: "mjicroanneal-megamatt" +description: "OLMo3_4T mjicroanneal with 3.88B tokens of megamatt" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 1 +gpus: 8 +preemptible: false +max_tokens: 7_767_300_882 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6.17B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: megamatt # 3.88B tokens + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web_pro_max/beaker_rewrites-decon-sparkle-motion/**/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_clean_thoughts.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_clean_thoughts.yaml new file mode 100644 index 000000000..b41f494b3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_clean_thoughts.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_clean_thoughts +description: nanonanneal just for omr_clean_thoughts +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 736_917_144 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 
+activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-clean-thoughts/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_dialogue_2students_error_correct.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_dialogue_2students_error_correct.yaml new file mode 100644 index 000000000..5a7a0aa87 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_dialogue_2students_error_correct.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_dialogue_2students_error_correct +description: nanonanneal just for omr_dialogue_2students_error_correct +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 291_366_340 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-dialogue-2students-error-correct/allenai/dolma2-tokenizer/*.npy diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_full_thoughts.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_full_thoughts.yaml new file mode 100644 index 000000000..fb476de1f --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_full_thoughts.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_full_thoughts +description: nanonanneal just for omr_full_thoughts +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_701_719_376 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-full-thoughts/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason.yaml new file mode 100644 index 000000000..82abf23f5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_metareason +description: nanonanneal just for omr_metareason +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 716_777_332 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: 
dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason_noheader.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason_noheader.yaml new file mode 100644 index 000000000..4ec457485 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_metareason_noheader.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_metareason_noheader +description: nanoanneal just for omr_metareason_noheader +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 725_679_384 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-metareason-noheader/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_sleek.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_sleek.yaml new file mode 100644 index 000000000..1c4553634 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_sleek.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_sleek +description: nanoanneal just for omr_sleek +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 454_093_930 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-sleek/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_lecture.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_lecture.yaml new file mode 100644 index 000000000..9f4c9e65d --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_lecture.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_teacher_student_lecture +description: nanoanneal just for omr_teacher_student_lecture +budget: ai2/oe-base +workspace: 
ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 913_258_022 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-lecture/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_planning.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_planning.yaml new file mode 100644 index 000000000..e60e42c85 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_planning.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_teacher_student_planning +description: nanoanneal just for omr_teacher_student_planning +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 184_938_956 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-planning/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_reformulation.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_reformulation.yaml new file mode 100644 index 000000000..ef79a0d63 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-omr_teacher_student_reformulation.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj-omr_teacher_student_reformulation +description: nanoanneal just for omr_teacher_student_reformulation +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 207_125_496 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/OpenMathReasoning/OpenMathReasoning-rewrite-teacher-student-reformulation/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH.yaml new file mode 100644 index 000000000..72b2c502d --- 
/dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH.yaml @@ -0,0 +1,35 @@ +name: nanoanneal-mj-tinyMATH_BOTH +description: PoT/MIND only side of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2_309_538_288 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH_mjim.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH_mjim.yaml new file mode 100644 index 000000000..e5175fc41 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_BOTH_mjim.yaml @@ -0,0 +1,36 @@ +name: nanoanneal-mj-tinyMATH_BOTH_mjim +description: PoT/MIND only side of TinyMATH ++ mjim +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2_528_755_552 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear 
+warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_MIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_MIND.yaml new file mode 100644 index 000000000..52181afd5 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_MIND.yaml @@ -0,0 +1,34 @@ +name: nanoanneal-mj-tinyMATH_MIND +description: MIND only side of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_828_357_020 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_PoT.yaml new file mode 100644 index 000000000..1ff74d9ec --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/olmo3_4T-nano-tinyMATH_PoT.yaml @@ -0,0 +1,34 @@ +name: nanoanneal-mj-tinyMATH_PoT +description: PoT only side of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 512_070_372 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/basic_math_mj/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_og.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_og.yaml new file mode 100644 index 000000000..6b46f222c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_og.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj_stackedurw-sanity_og +description: stack-edu python rewrites sanity check | og version +budget: ai2/oe-base +workspace: 
ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1882533086 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/stack-edu/rewrites/sanity_check/og_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_rewrite.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_rewrite.yaml new file mode 100644 index 000000000..0af61956c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/stackedu_rewrites/stackedurw_sanity_rewrite.yaml @@ -0,0 +1,33 @@ +name: nanoanneal-mj_stackedurw-sanity_rewrite +description: stack-edu python rewrites sanity check | rewrite version +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2046268996 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: 
SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/stack-edu/rewrites/sanity_check/rewrite_tokens/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-1B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-1B.yaml new file mode 100644 index 000000000..fb952f576 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-1B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-1B +description: OLMo3_4T 10B-microanneal with 1B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-2B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-2B.yaml new file mode 100644 index 000000000..4441991db --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-2B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-2B 
+description: OLMo3_4T 10B-microanneal with 2B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 2_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-3B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-3B.yaml new file mode 100644 index 000000000..d503e4375 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-3B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-3B +description: OLMo3_4T 10B-microanneal with 3B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 3_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 
0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-4B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-4B.yaml new file mode 100644 index 000000000..43e2c4270 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-4B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-4B +description: OLMo3_4T 10B-microanneal with 4B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 4_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-5B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-5B.yaml new file mode 100644 index 000000000..37ca2ec7b --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-5B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-5B +description: OLMo3_4T 10B-microanneal with 5B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 5_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-6B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-6B.yaml new file mode 100644 index 000000000..b720e5f7b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-6B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-6B +description: OLMo3_4T 10B-microanneal with 6B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 6_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 
+activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-7B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-7B.yaml new file mode 100644 index 000000000..3cc791fcc --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-7B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-7B +description: OLMo3_4T 10B-microanneal with 7B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 7_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-8B.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-8B.yaml new file mode 100644 index 000000000..82bb80ed1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-8B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-8B +description: OLMo3_4T 10B-microanneal with 8B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 8_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-9B.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-9B.yaml new file mode 100644 index 000000000..bd86da485 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/submodular_dolmino_curves/olmo3_4T-nano_dolmino-9B.yaml @@ -0,0 +1,33 @@ +name: microanneal-dolminos_math_baseline-9B +description: OLMo3_4T 10B-microanneal with 9B dolminos math dataset tokens +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 9_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 
+model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: dolminos_math + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/flat_dolmino_math/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ1.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ1.yaml new file mode 100644 index 000000000..8dc900261 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ1.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_lintQ1" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the bottom quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: 
swallowcode2 # 5.28B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q1/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ2.yaml new file mode 100644 index 000000000..ffe8c3e6a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ2.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_lintQ2" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the second quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 6.07B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q2/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ3.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ3.yaml new file mode 100644 index 000000000..049d323fd --- 
/dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ3.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_lintQ3" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the third quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 6.37B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q3/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ4.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ4.yaml new file mode 100644 index 000000000..f48dab7e6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-lintQ4.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode2_scor_lintQ4" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the fourth quartile of linted data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 
+global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 4.54B tokens | 5B requested + target_ratio: 0.5 + repetition_factor: 1.2 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/lint/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ1.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ1.yaml new file mode 100644 index 000000000..eb3abc365 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ1.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode2_scor_sgcrQ1" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the bottom quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 
0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 2.11B tokens | 5B requested + target_ratio: 0.5 + repetition_factor: 3.0 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q1/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ2.yaml new file mode 100644 index 000000000..6c6283edc --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ2.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_sgcrQ2" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the second quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 2.11B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q2/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ3.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ3.yaml new file mode 100644 index 000000000..5cb33ccac --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ3.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcode2_scor_sgcrQ3" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the third quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 11.98B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q3/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ4.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ4.yaml new file mode 100644 index 000000000..77d9b023b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2-sgcrQ4.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode2_scor_sgcrQ4" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only) | Just the 
fourth quartile of SGCR data" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 2.06B tokens | 5B requested + target_ratio: 0.5 + repetition_factor: 3.0 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2.yaml new file mode 100644 index 000000000..52d78302e --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode2.yaml @@ -0,0 +1,36 @@ +name: "microanneal-swallowcode2_scor" +description: "OLMo3_4T 10B-anneal with 5T tokens of swallowcode2 with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 
+load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 22B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q1/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q2/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q3/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode_scor_improved_code.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode_scor_improved_code.yaml new file mode 100644 index 000000000..fd744a753 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-microanneal-swallowcode_scor_improved_code.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcode_scor" +description: "OLMo3_4T 10B-anneal with 5B tokens of swallowcode with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 1 +gpus: 8 +preemptible: false +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + -
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode_scor # 18.8B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode2.yaml new file mode 100644 index 000000000..7c9073db7 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode2.yaml @@ -0,0 +1,36 @@ +name: "millianneal-swallowcode2_scor" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of swallowcode2 with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # 22B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q1/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q2/*.npy + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q3/*.npy + -
s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/full_ts2_flow/microanneal_partitions/sgcr/q4/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_scor_improved_code.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_scor_improved_code.yaml new file mode 100644 index 000000000..1fb9d82c0 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_scor_improved_code.yaml @@ -0,0 +1,34 @@ +name: "millianneal-swallowcode_scor" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of swallowcode with SCOR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode_scor # 18.8B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/scor_final_data/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_sgcr_improved_code.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_sgcr_improved_code.yaml new file mode 100644 index 000000000..8c71518be --- /dev/null +++
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-millianneal-swallowcode_sgcr_improved_code.yaml @@ -0,0 +1,34 @@ +name: "millianneal-swallowcode_sgcr" +description: "OLMo3_4T 25B-anneal with 12.5B tokens of swallowcode with SGCR rewrites (improved code only)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 25_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 25B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode_sgcr # 14.37B tokens | 12.5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode/sgcr_improved_code_tokens/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample.yaml new file mode 100644 index 000000000..867c8048c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcodeMulti_sample" +description: "OLMo3_4T | Using the SGCR rewrites of the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 7_045_850_528 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix"
+tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_sgcr_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sampleOG.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sampleOG.yaml new file mode 100644 index 000000000..4e71abc11 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sampleOG.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcodeMulti_sampleOG" +description: "OLMo3_4T | Using the OG data rewritten in the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 6_860_126_676 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_sgcr_data_og/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scor.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scor.yaml new file mode 100644 index 000000000..b7c75f85c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scor.yaml @@ -0,0 +1,33 @@ +name: "microanneal-swallowcodeMulti_sample_scor" +description: "OLMo3_4T | Using the SCOR rewrites of the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_915_766_938 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web # 6B tokens | 5.45B requested + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 # + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_scor2_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scorpy.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scorpy.yaml new file mode 100644 index 000000000..cc495ab77 --- /dev/null +++ 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowcode/olmo3_4T-mjicroanneal-swallowcodeMulti_sample_scorpy.yaml @@ -0,0 +1,34 @@ +name: "microanneal-swallowcodeMulti_sample_scorPy" +description: "OLMo3_4T | Using the (pythonic) SCOR rewrites of the multiPL sample" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 15_518_309_282 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: swallowcode2 + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowcode_pls/sample_scor_data_py/allenai/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2.yaml new file mode 100644 index 000000000..08e60ceb6 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmatt2 +description: microanneal just for 5B tokens of swallowmath rewrites (v2, from megamath_web_pro) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: 
ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: false +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/swallow_mmw/beaker_outputs/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2OG.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2OG.yaml new file mode 100644 index 000000000..a8aec2b05 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2OG.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmatt2OG +description: microanneal just for 5B tokens of swallowmath v2 data (before rewrites, i.e. 
sanitized megamathwebpro) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/megamath_web_pro_max_sansFm4p_dedup/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2_restart.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2_restart.yaml new file mode 100644 index 000000000..a7fe67539 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmatt2_restart.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmatt2_restart +description: microanneal just for 5B tokens of swallowmath rewrites (v2, from megamath_web_pro) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: 
gs://ai2-llm/checkpoints/mattj/microanneal-mj-swallowmathtest-swallowmatt2-3b4f423b/step3000 +load_state: true +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/megamath_web/swallow_mmw/beaker_outputs/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmattPROPER.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmattPROPER.yaml new file mode 100644 index 000000000..327e2ee92 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-micro_mj-swallowmathtest_swallowmattPROPER.yaml @@ -0,0 +1,33 @@ +name: microanneal-mj-swallowmathtest-swallowmattPROPER +description: microanneal just for 5B tokens of swallowmath rewrites +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/tokyotech-llm/swallowmath/beaker_outputs-decon-2/allenai/dolma2-tokenizer/*.npy diff --git 
a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_mix.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_mix.yaml new file mode 100644 index 000000000..3d4941058 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_mix.yaml @@ -0,0 +1,37 @@ +name: mjicroanneal-mj-swallowmathtest-swallowmatt-mix_proper +description: microanneal to have mix of rewrite and og data +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: OG_DATA + target_ratio: 0.25 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/rewrite_diversity_check/og_tokens/allenai/dolma2-tokenizer/*.npy + - name: REWRITE_DATA + target_ratio: 0.25 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/rewrite_diversity_check/rewrite_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_rewritex2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_rewritex2.yaml new file mode 100644 index 000000000..5959307ae --- /dev/null +++
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_div_rewritex2.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-swallowmathtest-swallowmatt-rewritex2_proper +description: microanneal to have 2x rewrites +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + repetition_factor: 2.0 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/rewrite_diversity_check/rewrite_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_fm4p.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_fm4p.yaml new file mode 100644 index 000000000..2573b003a --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_fm4p.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-swallowmathtest-fm4p +description: microanneal just for just the finemath4plus version of swallowmath (i.e., og requests) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 13783479380 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: 
ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/tokens/og_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_swallowmatt.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_swallowmatt.yaml new file mode 100644 index 000000000..aca1cb9ed --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/swallowmath/olmo3_4T-mjicro_mj-swallowmathtest_swallowmatt.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-swallowmathtest-swallowmatt +description: microanneal just for just what we have as swallowmatt rewrites of swallowmath go (i.e., the non-llama rewrites) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 8673251024 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/24.7B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/pretraining-data/sources/tokyotech-llm/swallowmath/0731_scratch/tokens/rewritten_tokens/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATH2PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATH2PoT.yaml new file mode 100644 index 000000000..0af38d779 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATH2PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal2.5-mj-tinyMATHPoT2 +description: mjicroanneal just for the PoT of TinyMATH2 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 414_687_378 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo25/step1413814 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATHPoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATHPoT.yaml new file mode 100644 index 000000000..972c18336 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo25_6T-mjicroanneal-tinyMATHPoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal2.5-mj-tinyMATHPoT +description: mjicroanneal just for the decon of the OG tinyMATH 
+budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 481_181_268 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo25/step1413814 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2MIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2MIND.yaml new file mode 100644 index 000000000..09875393c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2MIND.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH2MIND +description: mjicroanneal just for the MIND of TinyMATH2 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 2_264_755_032 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_ps_merged/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_2stud_merged/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2PoT.yaml new file mode 100644 index 000000000..b7656ded3 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH2PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATHPoT2 +description: mjicroanneal just for the PoT of TinyMATH2 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 414_687_378 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3MIND.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3MIND.yaml new file mode 100644 index 000000000..8b3c5b779 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3MIND.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH3MIND +description: mjicroanneal just for the MIND of TinyMATH3 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 2_158_600_528 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/2stud_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/ps_tokens/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3PoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3PoT.yaml new file mode 100644 index 000000000..bb88d1818 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH3PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH3PoT +description: mjicroanneal just for the PoT of TinyMATH3 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 
1_078_693_624 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/pot_data/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND.yaml new file mode 100644 index 000000000..d9a5768d1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH4MIND +description: mjicroanneal just for the MIND of TinyMATH4 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 774_164_830 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/2stud_data/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/ps_data/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND_uncurse.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND_uncurse.yaml new file mode 100644 index 000000000..00a074b6c --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4MIND_uncurse.yaml @@ -0,0 +1,34 @@ +name: mjicroanneal-mj-tinyMATH4MIND_uncurse +description: mjicroanneal just for the MIND of TinyMATH4 (uncurse version) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 774_164_830 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 42_069 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/2stud_data/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/ps_data/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT.yaml 
b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT.yaml new file mode 100644 index 000000000..f821a24f1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH4PoT +description: mjicroanneal just for the PoT of TinyMATH4 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 786_891_408 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2.yaml new file mode 100644 index 000000000..fe7a245cb --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH4PoT2 +description: mjicroanneal just for the PoT2 of TinyMATH4 +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 479_294_178 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: 
ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data2/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2_uncurse.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2_uncurse.yaml new file mode 100644 index 000000000..9272e4425 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH4PoT2_uncurse.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATH4PoT2_uncurse +description: mjicroanneal just for the PoT2 of TinyMATH4 (rerun, uncursed?) 
+budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 479_294_178 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 42_069 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data2/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHMIND.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHMIND.yaml new file mode 100644 index 000000000..b53792ca8 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHMIND.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATHMIND +description: mjicroanneal just for the MIND of TinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 1_797_460_446 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHPoT.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHPoT.yaml new file mode 100644 index 000000000..f6b66d223 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATHPoT.yaml @@ -0,0 +1,33 @@ +name: mjicroanneal-mj-tinyMATHPoT +description: mjicroanneal just for the decon of the OG tinyMATH +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 481_181_268 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy.yaml new file mode 100644 index 
000000000..4c669e2ea --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy.yaml @@ -0,0 +1,41 @@ +name: mjicroanneal-mj-tinyMATH_allholy +description: mjicroanneal for the MIND/PoT of TinyMATH's 1-4 (minus the cursed sets) +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 8_554_855_728 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_ps_merged/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_2stud_merged/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/2stud_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/ps_tokens/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/pot_data/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/2stud_data/allenai/dolma2-tokenizer/*.npy 
+ - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/MIND_data/ps_data/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy_decon.yaml b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy_decon.yaml new file mode 100644 index 000000000..42ed72632 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/mj_nanoanneals/tinyMATH/olmo3_4T-mjicroanneal_tinyMATH_allholy_decon.yaml @@ -0,0 +1,40 @@ +name: mjicroanneal-mj-tinyMATH_allholy_decon +description: mjicroanneal for the MIND/PoT of TinyMATH's 1-4 (minus the cursed sets) + DECON +budget: ai2/oe-base +workspace: ai2/olmo-3-microanneals +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 8_259_659_924 +global_batch_size: 2_097_152 +sequence_length: 8_192 +seed: 1_337 +model: olmo2_7B_swafix +tokenizer: dolma2 +priority: normal +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8_192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: web + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/6B/allenai/dolma2-tokenizer/*.npy + - name: SPECIFIC_HQ_DATA + target_ratio: 0.5 + paths: + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT/processed_data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/data/processed-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_2stud_merged-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH2/PoT_MIND_requests/MIND_ps_merged-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/2stud_data-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/MIND_data/ps_data-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH3/pot_data-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH4/pot_data2-decon-sparkle-motion/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/smoke-test-swafix-olmo3_7b-anneal.yaml b/src/cookbook/recipes/olmo3-midtraining/smoke-test-swafix-olmo3_7b-anneal.yaml new file mode 100644 index 000000000..830eb59b8 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/smoke-test-swafix-olmo3_7b-anneal.yaml @@ -0,0 +1,29 @@ +name: "olmo3-7b_microanneal-smoke-test" +description: "OLMo3 7b microanneal smoke test with swa fix" +budget: "ai2/oe-base" +workspace: "ai2/oe-data" +nodes: 2 +gpus: 8 +preemptible: true +max_tokens: 1_000_000_000 +global_batch_size: 2097152 +sequence_length: 4096 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step172000 +load_state: false +dataset: + sources: + - name: code + target_ratio: 1.0 + paths: + - gs://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/**/*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl-v2.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl-v2.yaml new file mode 100644 index 000000000..35571ae49 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl-v2.yaml @@ -0,0 +1,33 @@ +name: 
"olmo2-7b_10b-anneal_fim-stack-edu-hq-weighted-pl-dclm-v2" +description: "OLMo2 7b anneal to 10B Tokens for fim-stack-edu hq weighted-pl and dclm baseline" +budget: "ai2/oe-training" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 4096 +seed: 1337 +model: "olmo2_7B" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +annealing: + enabled: true + initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA +load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ +load_state: false +dataset: + sources: + - name: fim-stackedu-hq-v2-weighted-pl + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v2/allenai/dolma2-tokenizer/**/*.npy + - name: dclm-baseline-olmo2 + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl.yaml new file mode 100644 index 000000000..7c521f171 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo2-10B-micro_fim-stack-edu-hq-weighted-pl.yaml @@ -0,0 +1,33 @@ +name: "olmo2-7b_10b-anneal_fim-stack-edu-hq-weighted-pl-dclm" +description: "OLMo2 7b anneal to 10B Tokens for fim-stack-edu hq weighted-pl and dclm baseline" +budget: "ai2/oe-training" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 4096 +seed: 1337 +model: "olmo2_7B" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/augusta-google-1 
+rank_microbatch_size: 8192 +scheduler_type: linear +warmup_steps: 0 +annealing: + enabled: true + initial_lr: 6.135e-5 # See https://wandb.ai/ai2-llm/OLMo-2-1124-7B/reports/OLMo-2-7B-Nov-2024--VmlldzoxMDUzMzE1OA +load_path: gs://ai2-llm/checkpoints/shanea/OLMo-medium/peteish7/step928646/model_and_optim/ +load_state: false +dataset: + sources: + - name: fim-stackedu-hq-v2-weighted-pl + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: dclm-baseline-olmo2 + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-0*-*.npy diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-cmprs-filter.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-cmprs-filter.yaml new file mode 100644 index 000000000..f27aae1e0 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-cmprs-filter.yaml @@ -0,0 +1,69 @@ +name: "olmo3-7b-10B-all-sources-fim-stackedu-hq-filter-weighted-pl-v0-compress-filter" +description: "OLMo3 7b 10B with all sources + fim-stack-edu-hq-filter-weighted-pl-v0-compress-filter" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.452 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + # 19.9B + - name: code_fim-stackedu-hq-filter-v0-compress-filter + target_ratio: 0.198 + paths: + - s3://ai2-llm/preprocessed/stack-edu-fim/fim-weighted-pl-20B-decon_cmprs-filter-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-reweight-v1.yaml 
b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-reweight-v1.yaml new file mode 100644 index 000000000..ee45b1165 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl-reweight-v1.yaml @@ -0,0 +1,68 @@ +name: "olmo3-7b-10B-all-sources-fim-stackedu-hq-filter-weighted-pl-v0-reweight-v1" +description: "OLMo3 7b 10B with all sources + fim-stack-edu-hq-filter-weighted-pl-v0, reweighted (v1)" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.35 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code_fim-stackedu-hq-filter-v0 + target_ratio: 0.30 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - 
s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl.yaml new file mode 100644 index 000000000..9caaf2ce1 --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_all-sources-fim-stackedu-hq-filter-weighted-pl.yaml @@ -0,0 +1,68 @@ +name: "olmo3-7b-10B-all-sources-fim-stackedu-hq-filter-weighted-pl-v0" +description: "OLMo3 7b 10B with all sources + fim-stack-edu-hq-filter-weighted-pl-v0" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: 
gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: hqweb + target_ratio: 0.45 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy + - name: code_fim-stackedu-hq-filter-v0 + target_ratio: 0.2 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: finemath + # 10% less the .0195 over 10% for dolminos2math + target_ratio: 0.0806 + paths: + - gs://ai2-llm/preprocessed/finemath/finemath-3plus/allenai/dolma2-tokenizer/*.npy + - name: dolminos2math + target_ratio: 0.1194 + paths: + # 10.7B + - s3://ai2-llm/preprocessed/dolmino-math-1124-retok/dolma2-tokenizer/*.npy + # 1.25B + - s3://ai2-llm/preprocessed/midtraining-reasoning/mj_intermediate_math/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/MIND/allenai/dolma2-tokenizer/*.npy + - s3://ai2-llm/preprocessed/midtraining-reasoning/tinyMATH/PoT_tokens/allenai/dolma2-tokenizer/*.npy + - name: reddit + target_ratio: 0.089 + paths: + - gs://ai2-llm/pretraining-data/sources/reddit/dolma_raw/format_rewriting/densesub_highthresh_microanneal_4omini_rewrite_tokenized/*.npy + - name: instruction + target_ratio: 0.011 + paths: + - s3://ai2-llm/preprocessed/tulu-3-sft-for-olmo-3-midtraining/dolma2-tokenizer/tulu-3-midtrain-v0-data-simple-concat-with-new-line-with-generation-prompt/*.npy + - name: r1_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/big-reasoning-traces/allenai/dolma2-tokenizer/*.npy + - name: qwq_reasoning + target_ratio: 0.02375 + paths: + - s3://ai2-llm/preprocessed/thinking-data/qwq-traces/dolma2-tokenizer/*.npy + - name: gemini_reasoning + target_ratio: 0.0025 + paths: + - s3://ai2-llm/preprocessed/thinking-data/s1k-gemini-traces/dolma2-tokenizer/*.npy + diff --git 
a/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_fim-stack-edu-hq-weighted-pl.yaml b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_fim-stack-edu-hq-weighted-pl.yaml new file mode 100644 index 000000000..61766e23b --- /dev/null +++ b/src/cookbook/recipes/olmo3-midtraining/tm_anneals/olmo3_4T-10B-micro_fim-stack-edu-hq-weighted-pl.yaml @@ -0,0 +1,34 @@ +name: "olmo3-7b-10B-micro-fim-stack-edu-hq-weighted-pl-alldressed" +description: "OLMo3 7b anneal to 10B Tokens for fim-stack-edu hq weighted-pl-v0 + all dressed hq web" +budget: "ai2/oe-base" +workspace: "ai2/olmo-3-microanneals" +nodes: 4 +gpus: 8 +preemptible: true +max_tokens: 10_000_000_000 +global_batch_size: 2097152 +sequence_length: 8192 +seed: 1337 +model: "olmo2_7B_swafix" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +rank_microbatch_size: 16384 +scheduler_type: linear +warmup_steps: 0 +activation_checkpointing: true +annealing: + enabled: true +load_path: gs://ai2-llm/checkpoints/OLMo3-7B-swafix/step289000 +load_state: false +dataset: + sources: + - name: fim-stackedu-hq-v2-weighted-pl + target_ratio: 0.5 + paths: + - gs://ai2-llm/preprocessed/stack-edu-fim/weighted-pl-20B-v0/allenai/dolma2-tokenizer/**/*.npy + - name: hqweb + target_ratio: 0.5 + repetition_factor: 1.5 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3_subsamples/midtrain_pools/50B/allenai/dolma2-tokenizer/*.npy diff --git a/src/cookbook/utils/config.py b/src/cookbook/utils/config.py index ee8d6bf0a..85eb8c0ef 100644 --- a/src/cookbook/utils/config.py +++ b/src/cookbook/utils/config.py @@ -196,10 +196,7 @@ def build_train_config(config_path: Path, run_name: str, group_id: str, beaker_u trainer = config.trainer.build(train_module, data_loader) # If we have a load path and there is no checkpoint in the save folder, load the checkpoint from the load path. 
- if ( - not trainer.maybe_load_checkpoint(trainer.save_folder, load_trainer_state=base_config.load_state) - and base_config.load_path - ): + if not trainer.maybe_load_checkpoint(trainer.save_folder) and base_config.load_path: logger.info( f"Loading checkpoint from {base_config.load_path} and load_trainer_state: {base_config.load_state}" ) @@ -241,7 +238,7 @@ def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLa budget=group.config.budget or "ai2/oe-data", workspace=group.config.workspace, preemptible=group.config.preemptible, - beaker_image="petew/olmo-core-tch270cu126", + beaker_image="petew/olmo-core-tch270cu128", priority=group.config.priority, env_vars=[BeakerEnvVar(name="NCCL_DEBUG", value="INFO" if group.config.nccl_debug else "WARN")], env_secrets=[ @@ -253,6 +250,7 @@ def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLa BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), BeakerEnvSecret(name="GOOGLE_CLOUD_PROJECT", secret="GOOGLE_CLOUD_PROJECT"), ], + retries=3, setup_steps=[ 'git clone "$REPO_URL"', "conda shell.bash activate base", diff --git a/src/cookbook/utils/data.py b/src/cookbook/utils/data.py index c293b5c8f..ae2fba547 100644 --- a/src/cookbook/utils/data.py +++ b/src/cookbook/utils/data.py @@ -76,7 +76,7 @@ def get_token_counts_and_ratios( if scheme not in filesystems: filesystems[scheme] = get_filesystem_for_scheme(scheme) - with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor: + with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor: for source in source_configs: # Get the appropriate filesystem for this source scheme = next(iter({urlparse(path).scheme for path in source.paths}), "local")