-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_h5.py
More file actions
32 lines (25 loc) · 979 Bytes
/
Copy pathprocess_h5.py
File metadata and controls
32 lines (25 loc) · 979 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import h5py
import numpy as np
import glob
import sys
import os
from natsort import natsorted
# Directory that holds the per-split HDF5 shards. Taken from the first
# command-line argument; falls back to the current directory when absent.
data_dir = sys.argv[1] if sys.argv[1:] else "."

# Each entry pairs a shard filename stem (glob form) with the stem of the
# consolidated file that replaces those shards.
for shard_glob, merged_stem in (
    ("train*", "all_train"),
    ("valid*", "all_valid"),
    ("test*", "all_test"),
):
    # natsorted orders the shard paths naturally, which fixes the row order
    # of the merged datasets.
    shard_paths = natsorted(glob.glob(f"{data_dir}/{shard_glob}.h5"))
    if not shard_paths:
        # Nothing to merge for this split.
        continue

    # Read every shard fully into memory, keeping each file's "sequence" and
    # "target" arrays together until all shards have been loaded.
    loaded_pairs = []
    for shard_path in shard_paths:
        with h5py.File(shard_path, "r") as shard:
            loaded_pairs.append((shard["sequence"][:], shard["target"][:]))
    sequence_parts, target_parts = zip(*loaded_pairs)

    # Write both datasets, concatenated along the first axis, into a single
    # output file next to the shards.
    merged_path = f"{data_dir}/{merged_stem}.h5"
    with h5py.File(merged_path, "w") as merged:
        merged_sequences = np.concatenate(sequence_parts)
        merged.create_dataset("sequence", data=merged_sequences)
        merged_targets = np.concatenate(target_parts)
        merged.create_dataset("target", data=merged_targets)
    print(f"Created {merged_stem}.h5 from {len(shard_paths)} files")

    # The merged file is complete at this point, so the shards can go.
    for shard_path in shard_paths:
        os.remove(shard_path)
        print(f" Removed {os.path.basename(shard_path)}")

print("\nCleanup complete!")