Skip to content

Commit cd24f50

Browse files
authored
Merge pull request #18 from Learn2RAG/feature/align_with_importer
clean up json loader, use content_hash instead of chunk_hash
2 parents 8fc9fb0 + 3fc61a2 commit cd24f50

File tree

5 files changed

+34
-135
lines changed

5 files changed

+34
-135
lines changed

learn2rag/pipeline/data/load_kcenter.py renamed to learn2rag/pipeline/data/load_kcenter_dump.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
from pathlib import Path
44
from bs4 import BeautifulSoup
5+
import hashlib
56

67
root = Path("/home/large-file-storage/download-uknowit")
78
paths = list(root.rglob("raw-document.json"))
@@ -18,16 +19,19 @@
1819
html_content = raw.get("content", "")
1920
soup = BeautifulSoup(html_content, "html.parser")
2021
content = soup.get_text(separator=" ", strip=True)
22+
content_hash = hashlib.sha256(content.encode('utf-8')).hexdigest()
2123

2224
new = {
2325
"metadata": {
2426
"source": raw.get("id"),
25-
"content_hash": "",
27+
"content_hash": content_hash,
2628
"source_path": raw['categories'][0]['branchText'],
2729
"file_extension": "",
2830
"process_date": "",
2931
"process_time": "",
3032
"loader_type": "KCenterFormatter",
33+
"loader_id": "my_kcenter_dump",
34+
"document_id": document.get("docId"),
3135
"title": raw.get("title"),
3236
"summary": raw.get("summary"),
3337
"keywords": document.get("keywords"),

learn2rag/pipeline/ingestion.py

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue, SparseVector, VectorParams, MultiVectorConfig, MultiVectorComparator, Distance
1414

1515

16-
from . import loaders
16+
from . import json_loader
1717
from .embeddings import create_embeddings
1818

1919

@@ -29,11 +29,12 @@ def get_chunks_metadata(chunks: list[Document], item: str) -> Iterator[str]:
2929
logging.warning('%d out of %d chunks are missing "%s" in metadata; using empty string', missing, len(chunks), item)
3030

3131

32-
def point_exists(qdrant: Qdrant, collection_name: str, path: str, chunk_hash:str) -> bool:
32+
def point_exists(qdrant: Qdrant, collection_name: str, loader_id: str, path: str, content_hash:str) -> bool:
3333
filter = Filter(
3434
must=[
35+
FieldCondition(key="loader_id", match=MatchValue(value=loader_id)),
3536
FieldCondition(key="path", match=MatchValue(value=path)),
36-
FieldCondition(key="content_hash", match=MatchValue(value=chunk_hash)),
37+
FieldCondition(key="content_hash", match=MatchValue(value=content_hash)),
3738
]
3839
)
3940
result, _ = qdrant.client.scroll(
@@ -96,27 +97,23 @@ def payload(sample: dict[str, Any]) -> dict[str, str]:
9697
return {
9798
"content": sample["page_content"],
9899
"path": sample["metadata"]["source"],
99-
"content_hash": sample["chunk_hash"],
100+
"content_hash": sample["metadata"]["content_hash"],
100101
"title": sample["metadata"].get("title",""),
101102
"uri": sample["metadata"].get("uri",""),
102103
"loader_id": sample["metadata"]["loader_id"],
103104
"document_id": sample["metadata"].get("document_id", "")
104105
}
105106

106107
def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None:
107-
# TODO: enable list of file paths in loader and adapt user_config
108-
# Load the documents from pdf
109-
# all_documents = loaders.sync_pdf_loader(user_config["file_path"])
110-
# TODO: use ifdt loader to load pdf in json, then:
111108
logging.info('Loading documents')
112-
all_documents = loaders.json_loader(user_config['imported_documents_file_path'])
109+
all_documents = json_loader.json_loader(user_config['imported_documents_file_path'])
113110

114111
# Split documents into chunks
115112
logging.info('Splitting documents into chunks')
116113
text_splitter = RecursiveCharacterTextSplitter(
117114
chunk_size=opt_config["chunk_size"], chunk_overlap=opt_config["chunk_overlap"]
118115
)
119-
chunks = text_splitter.split_documents(all_documents)
116+
chunks = text_splitter.split_documents(all_documents)[:2]
120117

121118
collection_name = user_config["collection_name"]
122119

@@ -126,7 +123,6 @@ def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None:
126123
opt_config=opt_config
127124
)
128125

129-
130126
chunks_content = [chunk.page_content for chunk in chunks]
131127
if len(opt_config["multi_search"]) > 0 and opt_config["query_mode"] == "multi":
132128
chunks_metadata = {}
@@ -140,8 +136,6 @@ def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None:
140136
else:
141137
raise TypeError(f"dense_vecs must be np.ndarray, got {type(dense_vecs)}")
142138

143-
# TODO: hash if you want to monitore changes in metadata
144-
chunk_hash = [hashlib.md5(chunk.page_content.encode()).hexdigest() for chunk in chunks]
145139
# Todo: handle different vector lengths for batch encoding when using sparse vectors
146140

147141
logging.info('Creating embeddings...')
@@ -158,40 +152,38 @@ def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None:
158152
if isinstance(embeddings, dict) and "dense_vecs" in embeddings:
159153
if opt_config["search_mode"] == "dense":
160154
chunks_with_embeddings = [
161-
dict(chunk) | {"dense_vec": dense, "chunk_hash": c_hash}
162-
for chunk, dense, c_hash in zip(chunks, embeddings["dense_vecs"], chunk_hash)
155+
dict(chunk) | {"dense_vec": dense}
156+
for chunk, dense in zip(chunks, embeddings["dense_vecs"])
163157
]
164158
if opt_config["search_mode"] == "dense_sparse":
165159
chunks_with_embeddings = [
166160
dict(chunk)
167-
| {"dense_vec": dense, "lexical_weights": sparse, "chunk_hash": c_hash}
168-
for chunk, dense, sparse, c_hash in zip(
161+
| {"dense_vec": dense, "lexical_weights": sparse}
162+
for chunk, dense, sparse in zip(
169163
chunks,
170164
list(embeddings["dense_vecs"]),
171165
list(embeddings["lexical_weights"]),
172-
chunk_hash,
173166
)
174167
]
175168
if opt_config["search_mode"] == "dense_sparse_colbert":
176169
chunks_with_embeddings = [
177170
dict(chunk)
178-
| {"dense_vec": dense, "lexical_weights": sparse, "colbert_vecs": colbert, "chunk_hash": c_hash}
179-
for chunk, dense, sparse, colbert, c_hash in zip(
171+
| {"dense_vec": dense, "lexical_weights": sparse, "colbert_vecs": colbert}
172+
for chunk, dense, sparse, colbert in zip(
180173
chunks,
181174
list(embeddings["dense_vecs"]),
182175
list(embeddings["lexical_weights"]),
183176
list(embeddings['colbert_vecs']),
184-
chunk_hash,
185177
)
186178
]
187179
else:
188180
chunks_with_embeddings = [
189-
dict(chunk) | {"dense_vec": dense, "chunk_hash": c_hash}
190-
for chunk, dense, c_hash in zip(chunks, embeddings, chunk_hash)
181+
dict(chunk) | {"dense_vec": dense}
182+
for chunk, dense in zip(chunks, embeddings)
191183
]
192184

193185
for sample in chunks_with_embeddings:
194-
if not point_exists(qdrant, collection_name, sample['metadata']['source'], sample['chunk_hash']):
186+
if not point_exists(qdrant, collection_name, sample['metadata']['loader_id'], sample['metadata']['source'], sample['metadata']['content_hash']):
195187
if opt_config["search_mode"] == "dense_sparse":
196188
insert_dense_sparse(qdrant, collection_name, sample)
197189
elif opt_config["search_mode"] == "dense_sparse_colbert":

learn2rag/pipeline/json_loader.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from langchain_community.document_loaders import JSONLoader
2+
from langchain_core.documents import Document
3+
4+
5+
def json_loader(file_path: str) -> list[Document]:
6+
loader = JSONLoader(
7+
file_path,
8+
jq_schema=".[]",
9+
content_key="content",
10+
metadata_func=lambda record, meta: record.get("metadata", {}),
11+
)
12+
return loader.load()

learn2rag/pipeline/loaders.py

Lines changed: 0 additions & 109 deletions
This file was deleted.

learn2rag/pipeline/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
from .config import user_config, opt_config
1919

20-
#ingestion.index(user_config, opt_config)
20+
ingestion.index(user_config, opt_config)
2121

2222
if opt_config["query_mode"] == "multi":
2323
# in query_mode 'multi' different querys for each vector in the multi-vector are allowed

0 commit comments

Comments (0)