# ppc-swh-rocksdb/create_contents.py at main · 0xfederama/ppc-swh-rocksdb · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
import os
import time

from pyarrow.parquet import ParquetFile

# Binary size units, expressed in bytes.
KiB = 1 << 10
MiB = KiB << 10
GiB = MiB << 10

# Not referenced by the visible script; presumably an upper bound on the
# size of the generated text file (infinite = no cap) -- TODO confirm.
txt_size = float("inf")

# Input dataset and the two artifacts derived from it.
parquet_path = "/path/to/repo/data/the-stack-64M.parquet"
contents_path = "/path/to/repo/data/the-stack-64M-contents.txt"
contents_index_path = "/path/to/repo/data/the-stack-64M-contents-index.json"

def main() -> None:
    """Flatten the parquet dataset into a contents file plus a JSON offset index.

    Streams the ``hexsha``, ``size`` and ``content`` columns of
    ``parquet_path`` batch by batch, appends every content string to
    ``contents_path`` and finally writes ``{sha: [start_offset, size]}`` to
    ``contents_index_path``. If ``contents_path`` already exists the function
    reports it and returns without writing anything.

    NOTE(review): offsets are accumulated from the dataset's ``size`` column,
    not measured from what is actually written. This presumes ``size`` is the
    UTF-8 byte length of ``content`` -- confirm against the dataset.
    """
    print(f"Starting at {time.asctime()}")

    print(
        f"Reading from {parquet_path}, writing to {contents_path} and {contents_index_path}"
    )

    # Only the contents file is checked: an existing dump is never appended to
    # or overwritten, even if the index file is missing.
    if os.path.exists(contents_path):
        print(f"File {contents_path} already exists")
        return

    print(f"Building the file {contents_path}")
    sha_sizes = {}  # made of { sha: (start_index, size) }
    tot_put_time = 0.0
    tot_size = 0

    # Explicit encoding and newline="" keep the bytes on disk independent of
    # the locale and of platform newline translation, which would otherwise
    # silently shift the byte offsets stored in the index.
    with open(contents_path, "a+", encoding="utf-8", newline="") as f:
        pf = ParquetFile(parquet_path)
        for batch in pf.iter_batches(columns=["hexsha", "size", "content"]):
            rows = zip(batch["hexsha"], batch["size"], batch["content"])
            for sha_cell, size_cell, content_cell in rows:
                sha = str(sha_cell)
                content = str(content_cell)
                size = int(str(size_cell))
                # A repeated sha overwrites the earlier entry; its content is
                # still appended, so offsets of later entries stay consistent.
                sha_sizes[sha] = (tot_size, size)
                tot_size += size
                # Time only the write call itself, with a monotonic clock.
                start_put = time.perf_counter()
                f.write(content)
                tot_put_time += time.perf_counter() - start_put

    print(f"Building the file {contents_index_path}")
    with open(contents_index_path, "w", encoding="utf-8") as f:
        # Stream the index instead of building one huge string in memory.
        json.dump(sha_sizes, f, indent=4)

    # Guard against an empty dataset (or a timer delta of zero), which would
    # otherwise raise ZeroDivisionError after all the work is done.
    throughput = (tot_size / MiB) / tot_put_time if tot_put_time > 0 else 0.0
    print(
        f"Total time to write {round(tot_put_time, 3)} s, {round(throughput, 3)} MiB/s"
    )

    print(f"Ending at {time.asctime()}")


if __name__ == "__main__":
    main()