diff --git a/README.md b/README.md
index e8d94b3..8cf1660 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,7 @@ make test-full
 
 ```bash
 uv run hf download songlab/gpn-animal-promoter-dataset --repo-type dataset --local-dir data/gpn-animal-promoter-dataset
+uv run hf download gonzalobenegas/Angiosperm_16_genomes_sharded --repo-type dataset --local-dir data/gonzalobenegas/Angiosperm_16_genomes_sharded
 ```
 
 ## How to run
diff --git a/configs/data/plants.yaml b/configs/data/plants.yaml
index d527a47..d22c9cb 100644
--- a/configs/data/plants.yaml
+++ b/configs/data/plants.yaml
@@ -2,7 +2,7 @@ defaults:
   - default
 
 # Training dataset: Angiosperm 16 genomes
-dataset_name: kuleshov-group/Angiosperm_16_genomes
+dataset_name: data/gonzalobenegas/Angiosperm_16_genomes_sharded
 
 # Batch size configuration
 batch_size: 2048 # Total effective batch size
diff --git a/configs/experiment/plants_clm_transformer_base.yaml b/configs/experiment/plants_clm_transformer_base.yaml
new file mode 100644
index 0000000..bcd1094
--- /dev/null
+++ b/configs/experiment/plants_clm_transformer_base.yaml
@@ -0,0 +1,31 @@
+# @package _global_
+
+# to execute this experiment run:
+# python glm_experiments/train.py experiment=plants_clm_transformer_base
+
+defaults:
+  - override /data: plants
+  - override /model: clm_transformer_base
+  - override /trainer: gpn_animal_promoter
+
+logger:
+  wandb:
+    name: experiment-plants-clm-transformer-base
+    tags: ["experiment", "plants", "clm", "transformer", "base"]
+
+data:
+  _target_: glm_experiments.data.lm_datamodule.CLMDataModule
+  per_device_batch_size: 256
+
+model:
+  scheduler:
+    _target_: transformers.get_cosine_with_min_lr_schedule_with_warmup
+    _partial_: true
+    num_warmup_steps: 2000
+    num_training_steps: ${trainer.max_steps}
+    min_lr_rate: 0.1 # Decay to 10% of max lr
+
+trainer:
+  max_steps: 20000
+  log_every_n_steps: 1000
+  val_check_interval: 1000
diff --git a/experiments/training_data/shard_dataset.py b/experiments/training_data/shard_dataset.py
new file mode 100755
index 0000000..028f6fa
--- /dev/null
+++ b/experiments/training_data/shard_dataset.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pandas",
+#     "huggingface_hub",
+#     "tqdm",
+#     "zstandard",
+# ]
+# ///
+"""
+Download and shard the Angiosperm_16_genomes dataset.
+
+Downloads the dataset from kuleshov-group/Angiosperm_16_genomes and creates
+64 shards per split in jsonl.zst format.
+
+Usage:
+    uv run shard_dataset.py [--output-dir OUTPUT_DIR] [--n-shards N_SHARDS]
+"""
+
+import argparse
+from pathlib import Path
+
+import pandas as pd
+from huggingface_hub import hf_hub_download
+from tqdm import tqdm
+import numpy as np
+
+
+def download_split(repo_id: str, split: str, cache_dir: Path) -> Path:
+    """Download a single split from HuggingFace."""
+    filename = f"data/{split}/{split}.jsonl.zst"
+    return Path(
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            repo_type="dataset",
+            cache_dir=cache_dir,
+        )
+    )
+
+
+def shard_split(
+    input_path: Path,
+    output_dir: Path,
+    split: str,
+    n_shards: int,
+    seed: int = 42,
+) -> None:
+    """Load a split and create sharded output files."""
+    print(f"Loading {split} split from {input_path}...")
+    df = pd.read_json(input_path, lines=True)
+    print(f"  Loaded {len(df):,} rows")
+
+    # Shuffle the data
+    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
+
+    # Create output directory
+    split_dir = output_dir / split
+    split_dir.mkdir(parents=True, exist_ok=True)
+
+    # Split into shards and save
+    print(f"Writing {n_shards} shards...")
+    shards = np.array_split(df, n_shards)
+    for i, df_shard in enumerate(tqdm(shards, desc=f"Sharding {split}")):
+        shard_path = split_dir / f"shard_{i:04d}.jsonl.zst"
+        df_shard.to_json(
+            shard_path,
+            orient="records",
+            lines=True,
+            compression={"method": "zstd", "threads": -1},
+        )
+
+    print(f"  Wrote {n_shards} shards to {split_dir}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download and shard Angiosperm_16_genomes dataset"
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("sharded_data"),
+        help="Output directory for sharded files (default: sharded_data)",
+    )
+    parser.add_argument(
+        "--n-shards",
+        type=int,
+        default=64,
+        help="Number of shards per split (default: 64)",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=None,
+        help="Cache directory for HuggingFace downloads",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for shuffling (default: 42)",
+    )
+    args = parser.parse_args()
+
+    repo_id = "kuleshov-group/Angiosperm_16_genomes"
+    splits = ["train", "valid", "test"]
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    for split in splits:
+        print(f"\n{'='*60}")
+        print(f"Processing {split} split")
+        print(f"{'='*60}")
+
+        # Download
+        input_path = download_split(repo_id, split, args.cache_dir)
+
+        # Shard
+        shard_split(
+            input_path=input_path,
+            output_dir=args.output_dir,
+            split=split,
+            n_shards=args.n_shards,
+            seed=args.seed,
+        )
+
+    print(f"\nDone! Sharded data written to {args.output_dir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/training_data/upload_dataset.py b/experiments/training_data/upload_dataset.py
new file mode 100755
index 0000000..e6edbf3
--- /dev/null
+++ b/experiments/training_data/upload_dataset.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "huggingface_hub",
+#     "tqdm",
+# ]
+# ///
+"""
+Upload sharded dataset to HuggingFace Hub.
+
+Uploads the sharded Angiosperm_16_genomes dataset created by shard_dataset.py
+to a new HuggingFace dataset repository.
+
+Usage:
+    uv run upload_dataset.py [--input-dir INPUT_DIR] [--repo-id REPO_ID]
+"""
+
+import argparse
+from pathlib import Path
+
+from huggingface_hub import HfApi, create_repo
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Upload sharded dataset to HuggingFace Hub"
+    )
+    parser.add_argument(
+        "--input-dir",
+        type=Path,
+        default=Path("sharded_data"),
+        help="Input directory with sharded files (default: sharded_data)",
+    )
+    parser.add_argument(
+        "--repo-id",
+        type=str,
+        default="gonzalobenegas/Angiosperm_16_genomes_sharded",
+        help="HuggingFace repo ID (default: gonzalobenegas/Angiosperm_16_genomes_sharded)",
+    )
+    parser.add_argument(
+        "--private",
+        action="store_true",
+        help="Make the repository private",
+    )
+    args = parser.parse_args()
+
+    if not args.input_dir.exists():
+        raise FileNotFoundError(f"Input directory not found: {args.input_dir}")
+
+    api = HfApi()
+
+    # Create the repository if it doesn't exist
+    print(f"Creating/checking repository: {args.repo_id}")
+    create_repo(
+        repo_id=args.repo_id,
+        repo_type="dataset",
+        exist_ok=True,
+        private=args.private,
+    )
+
+    # Upload each split
+    splits = ["train", "valid", "test"]
+    for split in splits:
+        split_dir = args.input_dir / split
+        if not split_dir.exists():
+            print(f"Warning: Split directory not found: {split_dir}, skipping")
+            continue
+
+        shard_files = sorted(split_dir.glob("shard_*.jsonl.zst"))
+        if not shard_files:
+            print(f"Warning: No shard files found in {split_dir}, skipping")
+            continue
+
+        print(f"\nUploading {split} split ({len(shard_files)} shards)...")
+
+        # Upload the entire split directory
+        api.upload_folder(
+            folder_path=str(split_dir),
+            path_in_repo=f"data/{split}",
+            repo_id=args.repo_id,
+            repo_type="dataset",
+            commit_message=f"Upload {split} split ({len(shard_files)} shards)",
+        )
+        print(f"  Uploaded {split} split")
+
+    print(f"\nDone! Dataset uploaded to: https://huggingface.co/datasets/{args.repo_id}")
+
+
+if __name__ == "__main__":
+    main()
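For reference, a minimal end-to-end sketch of how the pieces in this diff fit together, assembled from the script docstrings and argparse defaults above. The paths, shard count, and repo id shown are the defaults rather than requirements, and Hub write credentials are assumed to be configured for the upload step:

```bash
# Recreate the sharded dataset locally (64 jsonl.zst shards per split)
uv run experiments/training_data/shard_dataset.py --output-dir sharded_data --n-shards 64

# Push the shards to the Hub (defaults to gonzalobenegas/Angiosperm_16_genomes_sharded)
uv run experiments/training_data/upload_dataset.py --input-dir sharded_data

# Download the sharded dataset where configs/data/plants.yaml expects it, then train
uv run hf download gonzalobenegas/Angiosperm_16_genomes_sharded --repo-type dataset --local-dir data/gonzalobenegas/Angiosperm_16_genomes_sharded
python glm_experiments/train.py experiment=plants_clm_transformer_base
```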