-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_contents.py
More file actions
55 lines (43 loc) · 1.68 KB
/
create_contents.py
File metadata and controls
55 lines (43 loc) · 1.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
import os
import time
from pyarrow.parquet import ParquetFile
# Binary size multipliers; each unit is 1024 times the previous one.
KiB = 1024
MiB = 1024 * KiB
GiB = 1024 * MiB

# NOTE(review): presumably an upper bound on the size of the text output;
# it is not referenced by the code visible in this file -- confirm before use.
txt_size = float("inf")

# Input parquet file, and the two files this script produces from it.
parquet_path = "/path/to/repo/data/the-stack-64M.parquet"
contents_path = "/path/to/repo/data/the-stack-64M-contents.txt"
contents_index_path = "/path/to/repo/data/the-stack-64M-contents-index.json"
def main() -> None:
    """Concatenate the parquet `content` column into one file and index it.

    Reads the `hexsha`, `size` and `content` columns of `parquet_path` batch
    by batch, writes every `content` value back to back into `contents_path`,
    and records for each `hexsha` the pair `(start_index, size)`, where
    `start_index` is the running sum of the `size` values that came before it.
    The resulting mapping is written as indented JSON to
    `contents_index_path`.

    If `contents_path` already exists the script prints a message and exits
    without touching either output file.

    Raises:
        SystemExit: when `contents_path` already exists.
    """
    print(f"Starting at {time.asctime()}")
    print(
        f"Reading from {parquet_path}, writing to {contents_path} and {contents_index_path}"
    )

    if os.path.exists(contents_path):
        print(f"File {contents_path} already exists")
        # `exit()` is injected by the `site` module and is unavailable under
        # `python -S`; raising SystemExit has the same effect (exit status 0).
        raise SystemExit

    print(f"Building the file {contents_path}")
    sha_sizes = {}  # made of { sha: (start_index, size) }
    tot_put_time = 0.0
    tot_size = 0

    # Explicit encoding and newline="" keep the bytes on disk independent of
    # the platform's locale and newline translation.
    # NOTE(review): offsets accumulate the parquet `size` column rather than
    # the length of what is actually written -- confirm the two agree with
    # how readers of the index seek into this file.
    with open(contents_path, "w", encoding="utf-8", newline="") as f:
        pf = ParquetFile(parquet_path)
        for batch in pf.iter_batches(columns=["hexsha", "size", "content"]):
            # Convert each column to Python objects once per batch instead of
            # boxing one pyarrow scalar per cell.
            rows = zip(
                batch["hexsha"].to_pylist(),
                batch["size"].to_pylist(),
                batch["content"].to_pylist(),
            )
            for sha, size, content in rows:
                size = int(size)
                sha_sizes[str(sha)] = (tot_size, size)
                tot_size += size

                # Only the write itself is timed; perf_counter is monotonic,
                # so the accumulated total can never go backwards.
                start_put = time.perf_counter()
                f.write(str(content))
                tot_put_time += time.perf_counter() - start_put

    print(f"Building the file {contents_index_path}")
    with open(contents_index_path, "w", encoding="utf-8") as f:
        # Stream the index to disk rather than building one huge string first.
        json.dump(sha_sizes, f, indent=4)

    # An empty input (or a timer that registered no elapsed time) would
    # otherwise make the throughput computation divide by zero.
    if tot_put_time > 0:
        throughput = str(round((tot_size / MiB) / tot_put_time, 3))
    else:
        throughput = "n/a"
    print(f"Total time to write {round(tot_put_time, 3)} s, {throughput} MiB/s")
    print(f"Ending at {time.asctime()}")


if __name__ == "__main__":
    main()