-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge_chunks.py
More file actions
30 lines (26 loc) · 1.13 KB
/
merge_chunks.py
File metadata and controls
30 lines (26 loc) · 1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os
import json
import math
# Default number of consecutive chunks merged into one output chunk.
n = 5


def merge_chunks(chunks, group_size=n):
    """Merge consecutive chunks into groups of at most ``group_size``.

    Each merged chunk takes its "title" and "start" from the first chunk of
    the group, its "end" from the last chunk of the group, and its "text"
    from the group's texts joined with single spaces. The final group may
    hold fewer than ``group_size`` chunks.

    Args:
        chunks: List of dicts, each with the keys "number", "title",
            "start", "end" and "text".
        group_size: Maximum number of input chunks per merged chunk.

    Returns:
        A new list of merged chunk dicts; empty when ``chunks`` is empty.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    merged = []
    for start_idx in range(0, len(chunks), group_size):
        chunk_group = chunks[start_idx:start_idx + group_size]
        merged.append({
            # Every merged chunk carries the "number" of the file's very
            # first chunk, not of its own group.
            # NOTE(review): presumably "number" identifies the whole file
            # rather than a single chunk - confirm against the producer.
            "number": chunks[0]['number'],
            "title": chunk_group[0]['title'],
            "start": chunk_group[0]['start'],
            "end": chunk_group[-1]['end'],
            "text": " ".join(c['text'] for c in chunk_group),
        })
    return merged


def main(input_dir="jsons", output_dir="newjsons", group_size=n):
    """Merge the chunks of every ``.json`` file in ``input_dir``.

    Each input file is expected to hold an object with a "chunks" list and a
    top-level "text" value. A file with the same name is written to
    ``output_dir`` containing the merged chunks and the unchanged "text".

    Args:
        input_dir: Directory scanned (non-recursively) for ``.json`` files.
        output_dir: Directory receiving the merged files; created if missing.
        group_size: Maximum number of input chunks per merged chunk.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    # List the input first so a missing input directory fails before
    # anything is created on disk.
    filenames = os.listdir(input_dir)
    # Creating the output directory is loop-invariant, so do it once.
    os.makedirs(output_dir, exist_ok=True)

    for filename in filenames:
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(input_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        new_chunks = merge_chunks(data['chunks'], group_size)

        out_path = os.path.join(output_dir, filename)
        with open(out_path, "w", encoding="utf-8") as jsonfile:
            json.dump(
                {"chunks": new_chunks, "text": data['text']},
                jsonfile,
                indent=4,
            )


if __name__ == "__main__":
    main()