-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupload_dataset.py
More file actions
executable file
·89 lines (73 loc) · 2.49 KB
/
upload_dataset.py
File metadata and controls
executable file
·89 lines (73 loc) · 2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "huggingface_hub",
# "tqdm",
# ]
# ///
"""
Upload sharded dataset to HuggingFace Hub.
Uploads the sharded Angiosperm_16_genomes dataset created by shard_dataset.py
to a new HuggingFace dataset repository.
Usage:
uv run upload_dataset.py [--input-dir INPUT_DIR] [--repo-id REPO_ID]
"""
import argparse
from pathlib import Path
from huggingface_hub import HfApi, create_repo
def main():
parser = argparse.ArgumentParser(description="Upload sharded dataset to HuggingFace Hub")
parser.add_argument(
"--input-dir",
type=Path,
default=Path("sharded_data"),
help="Input directory with sharded files (default: sharded_data)",
)
parser.add_argument(
"--repo-id",
type=str,
default="gonzalobenegas/Angiosperm_16_genomes_sharded",
help="HuggingFace repo ID (default: gonzalobenegas/Angiosperm_16_genomes_sharded)",
)
parser.add_argument(
"--private",
action="store_true",
help="Make the repository private",
)
args = parser.parse_args()
if not args.input_dir.exists():
raise FileNotFoundError(f"Input directory not found: {args.input_dir}")
api = HfApi()
# Create the repository if it doesn't exist
print(f"Creating/checking repository: {args.repo_id}")
create_repo(
repo_id=args.repo_id,
repo_type="dataset",
exist_ok=True,
private=args.private,
)
# Upload each split
splits = ["train", "valid", "test"]
for split in splits:
split_dir = args.input_dir / split
if not split_dir.exists():
print(f"Warning: Split directory not found: {split_dir}, skipping")
continue
shard_files = sorted(split_dir.glob("shard_*.jsonl.zst"))
if not shard_files:
print(f"Warning: No shard files found in {split_dir}, skipping")
continue
print(f"\nUploading {split} split ({len(shard_files)} shards)...")
# Upload the entire split directory
api.upload_folder(
folder_path=str(split_dir),
path_in_repo=f"data/{split}",
repo_id=args.repo_id,
repo_type="dataset",
commit_message=f"Upload {split} split ({len(shard_files)} shards)",
)
print(f" Uploaded {split} split")
print(f"\nDone! Dataset uploaded to: https://huggingface.co/datasets/{args.repo_id}")
if __name__ == "__main__":
main()