-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchunk.py
More file actions
69 lines (51 loc) · 1.8 KB
/
chunk.py
File metadata and controls
69 lines (51 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from globals import BODY_FIELD, CHUNK_FIELD, ID
import json
import copy
import spacy.cli
from langchain.text_splitter import SpacyTextSplitter
from langchain.text_splitter import CharacterTextSplitter
# Hard ceiling on the number of characters handed to spaCy in one call;
# keeps inputs well under spaCy's default max_length (1,000,000 chars).
MAX_SPACY_SIZE = 90000
# NOTE(review): downloads the spaCy model at import time — a network side
# effect on every import/run of this module; consider moving to setup or
# guarding with an is-installed check. TODO confirm intended.
spacy.cli.download("en_core_web_sm")
# Coarse pre-splitter: breaks very long texts on sentence-ish boundaries
# (". ") into pieces of at most MAX_SPACY_SIZE so spaCy can process them.
char_splitter = CharacterTextSplitter(
    separator=". ",
    chunk_size=MAX_SPACY_SIZE,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)
# Fine-grained sentence-aware splitter producing ~1000-char chunks.
spacy_splitter = SpacyTextSplitter(chunk_size=1000)
def chunk_text(text):
    """Split *text* into small sentence-aware chunks.

    Texts longer than MAX_SPACY_SIZE are first coarsely split on ". " so
    that spaCy never receives an input exceeding its size limit; each
    resulting piece is then sentence-split into ~1000-char chunks.

    Args:
        text: Raw document body to split.

    Returns:
        list[str]: The chunks, in original text order.
    """
    if len(text) > MAX_SPACY_SIZE:
        texts_to_chunk = char_splitter.split_text(text=text)
    else:
        texts_to_chunk = [text]
    chunked_texts = []
    for t in texts_to_chunk:
        if len(t) > MAX_SPACY_SIZE:
            # BUG FIX: CharacterTextSplitter can return pieces LONGER than
            # chunk_size when a span contains no ". " separator; the original
            # code silently dropped such pieces. Hard-slice them instead so
            # no text is lost, then sentence-split each slice.
            for start in range(0, len(t), MAX_SPACY_SIZE):
                chunked_texts += spacy_splitter.split_text(
                    text=t[start:start + MAX_SPACY_SIZE]
                )
        else:
            chunked_texts += spacy_splitter.split_text(text=t)
    return chunked_texts
def chunk_document(document):
    """Expand one document into a list of per-chunk copies.

    Each returned copy carries one chunk of the original body under
    CHUNK_FIELD, an id of the form "<original_id>_<chunk_index>" under
    ID, and has BODY_FIELD removed.
    """
    pieces = chunk_text(document[BODY_FIELD])
    results = []
    for index, piece in enumerate(pieces):
        clone = copy.deepcopy(document)
        # Preserve the original mutation order (assign chunk/id, then drop
        # the body) in case any of the field constants alias each other.
        clone[CHUNK_FIELD] = piece
        clone[ID] = str(document[ID]) + "_" + str(index)
        del clone[BODY_FIELD]
        results.append(clone)
    return results
def batch_chunk(documents):
    """Chunk every document in *documents* and flatten into one list."""
    return [piece for doc in documents for piece in chunk_document(doc)]
def main():
    """Read Solr documents, chunk their bodies, and write the results.

    Reads a JSON array of documents from a fixed relative input path and
    writes the flattened per-chunk documents as JSON to a fixed relative
    output path.
    """
    input_file_name = "../data/solr_documents.json"
    output_file_name = "../data/solr_documents_with_chunks.json"
    # BUG FIX: the original opened the output file for writing before
    # reading the input, truncating any existing output even when loading
    # or parsing the input failed. Read and chunk first, then write.
    with open(input_file_name) as input_file:
        documents = json.load(input_file)
    chunked_documents = batch_chunk(documents)
    with open(output_file_name, "w") as output_file:
        json.dump(chunked_documents, output_file)


if __name__ == "__main__":
    main()