1 change: 1 addition & 0 deletions README.md
@@ -73,6 +73,7 @@ make test-full

```bash
uv run hf download songlab/gpn-animal-promoter-dataset --repo-type dataset --local-dir data/gpn-animal-promoter-dataset
uv run hf download gonzalobenegas/Angiosperm_16_genomes_sharded --repo-type dataset --local-dir data/gonzalobenegas/Angiosperm_16_genomes_sharded
```
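
For orientation, here is a minimal sketch of how the downloaded shards could be read back, assuming the Hugging Face `datasets` library (not shown in this PR) and the `data/<split>/shard_*.jsonl.zst` layout produced by the upload script further down; reading `.jsonl.zst` files requires the `zstandard` package.

```python
# Illustrative only: load the locally downloaded shards with `datasets`.
# The directory layout mirrors what upload_dataset.py pushes to the Hub.
from datasets import load_dataset

data_dir = "data/gonzalobenegas/Angiosperm_16_genomes_sharded"
ds = load_dataset(
    "json",
    data_files={
        "train": f"{data_dir}/data/train/shard_*.jsonl.zst",
        "valid": f"{data_dir}/data/valid/shard_*.jsonl.zst",
        "test": f"{data_dir}/data/test/shard_*.jsonl.zst",
    },
)
print(ds)
```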

## How to run
2 changes: 1 addition & 1 deletion configs/data/plants.yaml
@@ -2,7 +2,7 @@ defaults:
- default

# Training dataset: Angiosperm 16 genomes
dataset_name: kuleshov-group/Angiosperm_16_genomes
dataset_name: data/gonzalobenegas/Angiosperm_16_genomes_sharded

# Batch size configuration
batch_size: 2048 # Total effective batch size
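
As a rough illustration of how this total relates to the `per_device_batch_size: 256` set in the experiment config below, assuming the training code derives gradient-accumulation steps in the usual way (the world size here is an example value, not taken from this PR):

```python
# Illustrative arithmetic only; assumes
# effective_batch = per_device * world_size * accumulation_steps.
total_batch_size = 2048        # batch_size above
per_device_batch_size = 256    # from the experiment config below
world_size = 1                 # example value, not specified in this PR
accumulation_steps = total_batch_size // (per_device_batch_size * world_size)
print(accumulation_steps)      # 8 on a single device
```
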
31 changes: 31 additions & 0 deletions configs/experiment/plants_clm_transformer_base.yaml
@@ -0,0 +1,31 @@
# @package _global_

# To execute this experiment, run:
# python glm_experiments/train.py experiment=plants_clm_transformer_base

defaults:
- override /data: plants
- override /model: clm_transformer_base
- override /trainer: gpn_animal_promoter

logger:
wandb:
name: experiment-plants-clm-transformer-base
tags: ["experiment", "plants", "clm", "transformer", "base"]

data:
_target_: glm_experiments.data.lm_datamodule.CLMDataModule
per_device_batch_size: 256

model:
scheduler:
_target_: transformers.get_cosine_with_min_lr_schedule_with_warmup
_partial_: true
num_warmup_steps: 2000
num_training_steps: ${trainer.max_steps}
min_lr_rate: 0.1 # Decay to 10% of max lr

trainer:
max_steps: 20000
log_every_n_steps: 1000
val_check_interval: 1000
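
For readers unfamiliar with `_partial_: true`, a hedged sketch of how a scheduler config like the one above is typically consumed: Hydra's `instantiate` returns a `functools.partial` with the YAML kwargs bound, and the training module then supplies the optimizer. The model and optimizer below are placeholders, not values from this PR.

```python
# Illustrative sketch of the usual Hydra _partial_ pattern; equivalent to
# what instantiate(cfg.model.scheduler) would return for the config above.
import functools

import torch
from transformers import get_cosine_with_min_lr_schedule_with_warmup

scheduler_fn = functools.partial(
    get_cosine_with_min_lr_schedule_with_warmup,
    num_warmup_steps=2000,
    num_training_steps=20_000,  # resolved from ${trainer.max_steps}
    min_lr_rate=0.1,            # floor at 10% of the peak learning rate
)

model = torch.nn.Linear(8, 8)                      # placeholder model
optimizer = torch.optim.AdamW(model.parameters())  # placeholder optimizer
scheduler = scheduler_fn(optimizer=optimizer)
```
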
133 changes: 133 additions & 0 deletions experiments/training_data/shard_dataset.py
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pandas",
# "numpy",
# "huggingface_hub",
# "tqdm",
# "zstandard",
# ]
# ///
"""
Download and shard the Angiosperm_16_genomes dataset.

Downloads the dataset from kuleshov-group/Angiosperm_16_genomes and creates
64 shards per split in jsonl.zst format.

Usage:
uv run shard_dataset.py [--output-dir OUTPUT_DIR] [--n-shards N_SHARDS]
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from tqdm import tqdm


def download_split(repo_id: str, split: str, cache_dir: Path | None) -> Path:
"""Download a single split from HuggingFace."""
filename = f"data/{split}/{split}.jsonl.zst"
return Path(
hf_hub_download(
repo_id=repo_id,
filename=filename,
repo_type="dataset",
cache_dir=cache_dir,
)
)


def shard_split(
input_path: Path,
output_dir: Path,
split: str,
n_shards: int,
seed: int = 42,
) -> None:
"""Load a split and create sharded output files."""
print(f"Loading {split} split from {input_path}...")
df = pd.read_json(input_path, lines=True)
print(f" Loaded {len(df):,} rows")

# Shuffle the data
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Create output directory
split_dir = output_dir / split
split_dir.mkdir(parents=True, exist_ok=True)

# Split into shards and save
print(f"Writing {n_shards} shards...")
shards = np.array_split(df, n_shards)
for i, df_shard in enumerate(tqdm(shards, desc=f"Sharding {split}")):
shard_path = split_dir / f"shard_{i:04d}.jsonl.zst"
df_shard.to_json(
shard_path,
orient="records",
lines=True,
compression={"method": "zstd", "threads": -1},
)

print(f" Wrote {n_shards} shards to {split_dir}")


def main():
parser = argparse.ArgumentParser(
description="Download and shard Angiosperm_16_genomes dataset"
)
parser.add_argument(
"--output-dir",
type=Path,
default=Path("sharded_data"),
help="Output directory for sharded files (default: sharded_data)",
)
parser.add_argument(
"--n-shards",
type=int,
default=64,
help="Number of shards per split (default: 64)",
)
parser.add_argument(
"--cache-dir",
type=Path,
default=None,
help="Cache directory for HuggingFace downloads",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="Random seed for shuffling (default: 42)",
)
args = parser.parse_args()

repo_id = "kuleshov-group/Angiosperm_16_genomes"
splits = ["train", "valid", "test"]

args.output_dir.mkdir(parents=True, exist_ok=True)

for split in splits:
print(f"\n{'='*60}")
print(f"Processing {split} split")
print(f"{'='*60}")

# Download
input_path = download_split(repo_id, split, args.cache_dir)

# Shard
shard_split(
input_path=input_path,
output_dir=args.output_dir,
split=split,
n_shards=args.n_shards,
seed=args.seed,
)

print(f"\nDone! Sharded data written to {args.output_dir}")


if __name__ == "__main__":
main()
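
As an optional sanity check (not part of the script), a produced shard can be read straight back; pandas infers zstd compression from the `.zst` suffix, and the path below assumes the default `--output-dir`.

```python
# Read one output shard back to confirm it round-trips.
import pandas as pd

shard = pd.read_json("sharded_data/train/shard_0000.jsonl.zst", lines=True)
print(len(shard), list(shard.columns))
```
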
91 changes: 91 additions & 0 deletions experiments/training_data/upload_dataset.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "huggingface_hub",
# "tqdm",
# ]
# ///
"""
Upload sharded dataset to HuggingFace Hub.

Uploads the sharded Angiosperm_16_genomes dataset created by shard_dataset.py
to a new HuggingFace dataset repository.

Usage:
uv run upload_dataset.py [--input-dir INPUT_DIR] [--repo-id REPO_ID]
"""

import argparse
from pathlib import Path

from huggingface_hub import HfApi, create_repo


def main():
parser = argparse.ArgumentParser(
description="Upload sharded dataset to HuggingFace Hub"
)
parser.add_argument(
"--input-dir",
type=Path,
default=Path("sharded_data"),
help="Input directory with sharded files (default: sharded_data)",
)
parser.add_argument(
"--repo-id",
type=str,
default="gonzalobenegas/Angiosperm_16_genomes_sharded",
help="HuggingFace repo ID (default: gonzalobenegas/Angiosperm_16_genomes_sharded)",
)
parser.add_argument(
"--private",
action="store_true",
help="Make the repository private",
)
args = parser.parse_args()

if not args.input_dir.exists():
raise FileNotFoundError(f"Input directory not found: {args.input_dir}")

api = HfApi()

# Create the repository if it doesn't exist
print(f"Creating/checking repository: {args.repo_id}")
create_repo(
repo_id=args.repo_id,
repo_type="dataset",
exist_ok=True,
private=args.private,
)

# Upload each split
splits = ["train", "valid", "test"]
for split in splits:
split_dir = args.input_dir / split
if not split_dir.exists():
print(f"Warning: Split directory not found: {split_dir}, skipping")
continue

shard_files = sorted(split_dir.glob("shard_*.jsonl.zst"))
if not shard_files:
print(f"Warning: No shard files found in {split_dir}, skipping")
continue

print(f"\nUploading {split} split ({len(shard_files)} shards)...")

# Upload the entire split directory
api.upload_folder(
folder_path=str(split_dir),
path_in_repo=f"data/{split}",
repo_id=args.repo_id,
repo_type="dataset",
commit_message=f"Upload {split} split ({len(shard_files)} shards)",
)
print(f" Uploaded {split} split")

print(f"\nDone! Dataset uploaded to: https://huggingface.co/datasets/{args.repo_id}")


if __name__ == "__main__":
main()
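
Optionally, the upload can be spot-checked afterwards with `HfApi.list_repo_files`; the repo id below simply mirrors the script's default and is otherwise an example.

```python
# Optional spot-check after upload: list what actually landed in the repo.
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files(
    "gonzalobenegas/Angiosperm_16_genomes_sharded", repo_type="dataset"
)
train_shards = [f for f in files if f.startswith("data/train/")]
print(f"{len(train_shards)} train shards in the repo")
```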