1 change: 1 addition & 0 deletions README.md
@@ -73,6 +73,7 @@ make test-full

```bash
uv run hf download songlab/gpn-animal-promoter-dataset --repo-type dataset --local-dir data/gpn-animal-promoter-dataset
uv run hf download gonzalobenegas/Angiosperm_16_genomes_sharded --repo-type dataset --local-dir data/gonzalobenegas/Angiosperm_16_genomes_sharded
```
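
For orientation, here is a minimal sketch of how the downloaded shards could be read back, assuming the Hugging Face `datasets` library (not shown in this PR) and the `data/<split>/shard_*.jsonl.zst` layout produced by the upload script further down; reading `.jsonl.zst` files requires the `zstandard` package.

```python
# Illustrative only: load the locally downloaded shards with `datasets`.
# The directory layout mirrors what upload_dataset.py pushes to the Hub.
from datasets import load_dataset

data_dir = "data/gonzalobenegas/Angiosperm_16_genomes_sharded"
ds = load_dataset(
    "json",
    data_files={
        "train": f"{data_dir}/data/train/shard_*.jsonl.zst",
        "valid": f"{data_dir}/data/valid/shard_*.jsonl.zst",
        "test": f"{data_dir}/data/test/shard_*.jsonl.zst",
    },
)
print(ds)
```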

## How to run
2 changes: 1 addition & 1 deletion configs/data/plants.yaml
@@ -2,7 +2,7 @@ defaults:
- default

# Training dataset: Angiosperm 16 genomes
dataset_name: kuleshov-group/Angiosperm_16_genomes
dataset_name: data/gonzalobenegas/Angiosperm_16_genomes_sharded

# Batch size configuration
batch_size: 2048 # Total effective batch size
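
As a rough illustration of how this total relates to the `per_device_batch_size: 256` set in the experiment config below, assuming the training code derives gradient-accumulation steps in the usual way (the world size here is an example value, not taken from this PR):

```python
# Illustrative arithmetic only; assumes
# effective_batch = per_device * world_size * accumulation_steps.
total_batch_size = 2048        # batch_size above
per_device_batch_size = 256    # from the experiment config below
world_size = 1                 # example value, not specified in this PR
accumulation_steps = total_batch_size // (per_device_batch_size * world_size)
print(accumulation_steps)      # 8 on a single device
```
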
31 changes: 31 additions & 0 deletions configs/experiment/plants_clm_transformer_base.yaml
@@ -0,0 +1,31 @@
# @package _global_

# To execute this experiment, run:
# python glm_experiments/train.py experiment=plants_clm_transformer_base

defaults:
- override /data: plants
- override /model: clm_transformer_base
- override /trainer: gpn_animal_promoter

logger:
wandb:
name: experiment-plants-clm-transformer-base
tags: ["experiment", "plants", "clm", "transformer", "base"]

data:
_target_: glm_experiments.data.lm_datamodule.CLMDataModule
per_device_batch_size: 256

model:
scheduler:
_target_: transformers.get_cosine_with_min_lr_schedule_with_warmup
_partial_: true
num_warmup_steps: 2000
num_training_steps: ${trainer.max_steps}
min_lr_rate: 0.1 # Decay to 10% of max lr

trainer:
max_steps: 20000
log_every_n_steps: 1000
val_check_interval: 1000
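
For readers unfamiliar with `_partial_: true`, a hedged sketch of how a scheduler config like the one above is typically consumed: Hydra's `instantiate` returns a `functools.partial` with the YAML kwargs bound, and the training module then supplies the optimizer. The model and optimizer below are placeholders, not values from this PR.

```python
# Illustrative sketch of the usual Hydra _partial_ pattern; equivalent to
# what instantiate(cfg.model.scheduler) would return for the config above.
import functools

import torch
from transformers import get_cosine_with_min_lr_schedule_with_warmup

scheduler_fn = functools.partial(
    get_cosine_with_min_lr_schedule_with_warmup,
    num_warmup_steps=2000,
    num_training_steps=20_000,  # resolved from ${trainer.max_steps}
    min_lr_rate=0.1,            # floor at 10% of the peak learning rate
)

model = torch.nn.Linear(8, 8)                      # placeholder model
optimizer = torch.optim.AdamW(model.parameters())  # placeholder optimizer
scheduler = scheduler_fn(optimizer=optimizer)
```
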
133 changes: 133 additions & 0 deletions experiments/training_data/shard_dataset.py
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pandas",
# "numpy",
# "huggingface_hub",
# "tqdm",
# "zstandard",
# ]
# ///
"""
Download and shard the Angiosperm_16_genomes dataset.

Downloads the dataset from kuleshov-group/Angiosperm_16_genomes and creates
64 shards per split in jsonl.zst format.

Usage:
uv run shard_dataset.py [--output-dir OUTPUT_DIR] [--n-shards N_SHARDS]
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from tqdm import tqdm


def download_split(repo_id: str, split: str, cache_dir: Path | None) -> Path:
"""Download a single split from HuggingFace."""
filename = f"data/{split}/{split}.jsonl.zst"
return Path(
hf_hub_download(
repo_id=repo_id,
filename=filename,
repo_type="dataset",
cache_dir=cache_dir,
)
)


def shard_split(
input_path: Path,
output_dir: Path,
split: str,
n_shards: int,
seed: int = 42,
) -> None:
"""Load a split and create sharded output files."""
print(f"Loading {split} split from {input_path}...")
df = pd.read_json(input_path, lines=True)
print(f" Loaded {len(df):,} rows")

# Shuffle the data
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Create output directory
split_dir = output_dir / split
split_dir.mkdir(parents=True, exist_ok=True)

# Split into shards and save
print(f"Writing {n_shards} shards...")
shards = np.array_split(df, n_shards)
for i, df_shard in enumerate(tqdm(shards, desc=f"Sharding {split}")):
shard_path = split_dir / f"shard_{i:04d}.jsonl.zst"
df_shard.to_json(
shard_path,
orient="records",
lines=True,
compression={"method": "zstd", "threads": -1},
)

print(f" Wrote {n_shards} shards to {split_dir}")


def main():
parser = argparse.ArgumentParser(
description="Download and shard Angiosperm_16_genomes dataset"
)
parser.add_argument(
"--output-dir",
type=Path,
default=Path("sharded_data"),
help="Output directory for sharded files (default: sharded_data)",
)
parser.add_argument(
"--n-shards",
type=int,
default=64,
help="Number of shards per split (default: 64)",
)
parser.add_argument(
"--cache-dir",
type=Path,
default=None,
help="Cache directory for HuggingFace downloads",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="Random seed for shuffling (default: 42)",
)
args = parser.parse_args()

repo_id = "kuleshov-group/Angiosperm_16_genomes"
splits = ["train", "valid", "test"]

args.output_dir.mkdir(parents=True, exist_ok=True)

for split in splits:
print(f"\n{'='*60}")
print(f"Processing {split} split")
print(f"{'='*60}")

# Download
input_path = download_split(repo_id, split, args.cache_dir)

# Shard
shard_split(
input_path=input_path,
output_dir=args.output_dir,
split=split,
n_shards=args.n_shards,
seed=args.seed,
)

print(f"\nDone! Sharded data written to {args.output_dir}")


if __name__ == "__main__":
main()
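
As an optional sanity check (not part of the script), a produced shard can be read straight back; pandas infers zstd compression from the `.zst` suffix, and the path below assumes the default `--output-dir`.

```python
# Read one output shard back to confirm it round-trips.
import pandas as pd

shard = pd.read_json("sharded_data/train/shard_0000.jsonl.zst", lines=True)
print(len(shard), list(shard.columns))
```
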
91 changes: 91 additions & 0 deletions experiments/training_data/upload_dataset.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "huggingface_hub",
# "tqdm",
# ]
# ///
"""
Upload sharded dataset to HuggingFace Hub.

Uploads the sharded Angiosperm_16_genomes dataset created by shard_dataset.py
to a new HuggingFace dataset repository.

Usage:
uv run upload_dataset.py [--input-dir INPUT_DIR] [--repo-id REPO_ID]
"""

import argparse
from pathlib import Path

from huggingface_hub import HfApi, create_repo


def main():
parser = argparse.ArgumentParser(
description="Upload sharded dataset to HuggingFace Hub"
)
parser.add_argument(
"--input-dir",
type=Path,
default=Path("sharded_data"),
help="Input directory with sharded files (default: sharded_data)",
)
parser.add_argument(
"--repo-id",
type=str,
default="gonzalobenegas/Angiosperm_16_genomes_sharded",
help="HuggingFace repo ID (default: gonzalobenegas/Angiosperm_16_genomes_sharded)",
)
parser.add_argument(
"--private",
action="store_true",
help="Make the repository private",
)
args = parser.parse_args()

if not args.input_dir.exists():
raise FileNotFoundError(f"Input directory not found: {args.input_dir}")

api = HfApi()

# Create the repository if it doesn't exist
print(f"Creating/checking repository: {args.repo_id}")
create_repo(
repo_id=args.repo_id,
repo_type="dataset",
exist_ok=True,
private=args.private,
)

# Upload each split
splits = ["train", "valid", "test"]
for split in splits:
split_dir = args.input_dir / split
if not split_dir.exists():
print(f"Warning: Split directory not found: {split_dir}, skipping")
continue

shard_files = sorted(split_dir.glob("shard_*.jsonl.zst"))
if not shard_files:
print(f"Warning: No shard files found in {split_dir}, skipping")
continue

print(f"\nUploading {split} split ({len(shard_files)} shards)...")

# Upload the entire split directory
api.upload_folder(
folder_path=str(split_dir),
path_in_repo=f"data/{split}",
repo_id=args.repo_id,
repo_type="dataset",
commit_message=f"Upload {split} split ({len(shard_files)} shards)",
)
print(f" Uploaded {split} split")

print(f"\nDone! Dataset uploaded to: https://huggingface.co/datasets/{args.repo_id}")


if __name__ == "__main__":
main()
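
Optionally, the upload can be spot-checked afterwards with `HfApi.list_repo_files`; the repo id below simply mirrors the script's default and is otherwise an example.

```python
# Optional spot-check after upload: list what actually landed in the repo.
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files(
    "gonzalobenegas/Angiosperm_16_genomes_sharded", repo_type="dataset"
)
train_shards = [f for f in files if f.startswith("data/train/")]
print(f"{len(train_shards)} train shards in the repo")
```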